celltype-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. celltype_cli-0.1.0.dist-info/METADATA +267 -0
  2. celltype_cli-0.1.0.dist-info/RECORD +89 -0
  3. celltype_cli-0.1.0.dist-info/WHEEL +4 -0
  4. celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
  5. celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
  6. ct/__init__.py +3 -0
  7. ct/agent/__init__.py +0 -0
  8. ct/agent/case_studies.py +426 -0
  9. ct/agent/config.py +523 -0
  10. ct/agent/doctor.py +544 -0
  11. ct/agent/knowledge.py +523 -0
  12. ct/agent/loop.py +99 -0
  13. ct/agent/mcp_server.py +478 -0
  14. ct/agent/orchestrator.py +733 -0
  15. ct/agent/runner.py +656 -0
  16. ct/agent/sandbox.py +481 -0
  17. ct/agent/session.py +145 -0
  18. ct/agent/system_prompt.py +186 -0
  19. ct/agent/trace_store.py +228 -0
  20. ct/agent/trajectory.py +169 -0
  21. ct/agent/types.py +182 -0
  22. ct/agent/workflows.py +462 -0
  23. ct/api/__init__.py +1 -0
  24. ct/api/app.py +211 -0
  25. ct/api/config.py +120 -0
  26. ct/api/engine.py +124 -0
  27. ct/cli.py +1448 -0
  28. ct/data/__init__.py +0 -0
  29. ct/data/compute_providers.json +59 -0
  30. ct/data/cro_database.json +395 -0
  31. ct/data/downloader.py +238 -0
  32. ct/data/loaders.py +252 -0
  33. ct/kb/__init__.py +5 -0
  34. ct/kb/benchmarks.py +147 -0
  35. ct/kb/governance.py +106 -0
  36. ct/kb/ingest.py +415 -0
  37. ct/kb/reasoning.py +129 -0
  38. ct/kb/schema_monitor.py +162 -0
  39. ct/kb/substrate.py +387 -0
  40. ct/models/__init__.py +0 -0
  41. ct/models/llm.py +370 -0
  42. ct/tools/__init__.py +195 -0
  43. ct/tools/_compound_resolver.py +297 -0
  44. ct/tools/biomarker.py +368 -0
  45. ct/tools/cellxgene.py +282 -0
  46. ct/tools/chemistry.py +1371 -0
  47. ct/tools/claude.py +390 -0
  48. ct/tools/clinical.py +1153 -0
  49. ct/tools/clue.py +249 -0
  50. ct/tools/code.py +1069 -0
  51. ct/tools/combination.py +397 -0
  52. ct/tools/compute.py +402 -0
  53. ct/tools/cro.py +413 -0
  54. ct/tools/data_api.py +2114 -0
  55. ct/tools/design.py +295 -0
  56. ct/tools/dna.py +575 -0
  57. ct/tools/experiment.py +604 -0
  58. ct/tools/expression.py +655 -0
  59. ct/tools/files.py +957 -0
  60. ct/tools/genomics.py +1387 -0
  61. ct/tools/http_client.py +146 -0
  62. ct/tools/imaging.py +319 -0
  63. ct/tools/intel.py +223 -0
  64. ct/tools/literature.py +743 -0
  65. ct/tools/network.py +422 -0
  66. ct/tools/notification.py +111 -0
  67. ct/tools/omics.py +3330 -0
  68. ct/tools/ops.py +1230 -0
  69. ct/tools/parity.py +649 -0
  70. ct/tools/pk.py +245 -0
  71. ct/tools/protein.py +678 -0
  72. ct/tools/regulatory.py +643 -0
  73. ct/tools/remote_data.py +179 -0
  74. ct/tools/report.py +181 -0
  75. ct/tools/repurposing.py +376 -0
  76. ct/tools/safety.py +1280 -0
  77. ct/tools/shell.py +178 -0
  78. ct/tools/singlecell.py +533 -0
  79. ct/tools/statistics.py +552 -0
  80. ct/tools/structure.py +882 -0
  81. ct/tools/target.py +901 -0
  82. ct/tools/translational.py +123 -0
  83. ct/tools/viability.py +218 -0
  84. ct/ui/__init__.py +0 -0
  85. ct/ui/markdown.py +31 -0
  86. ct/ui/status.py +258 -0
  87. ct/ui/suggestions.py +567 -0
  88. ct/ui/terminal.py +1456 -0
  89. ct/ui/traces.py +112 -0
ct/tools/singlecell.py ADDED
@@ -0,0 +1,533 @@
1
+ """
2
+ Single-cell analysis tools: clustering, trajectory inference, cell type annotation.
3
+
4
+ Requires scanpy for computation. Gracefully returns install instructions if missing.
5
+ """
6
+
7
+ from ct.tools import registry
8
+
9
+
10
+ # Canonical marker gene panels for common cell types
11
+ MARKER_PANELS = {
12
+ "T cells": ["CD3D", "CD3E", "CD3G", "CD2", "TRAC"],
13
+ "CD4+ T cells": ["CD4", "IL7R", "CCR7", "LEF1"],
14
+ "CD8+ T cells": ["CD8A", "CD8B", "GZMK", "GZMB"],
15
+ "Regulatory T cells": ["FOXP3", "IL2RA", "CTLA4", "TIGIT"],
16
+ "B cells": ["CD79A", "CD79B", "MS4A1", "CD19", "PAX5"],
17
+ "Plasma cells": ["JCHAIN", "MZB1", "SDC1", "XBP1"],
18
+ "NK cells": ["NKG7", "GNLY", "KLRD1", "KLRF1", "NCAM1"],
19
+ "Monocytes": ["LYZ", "S100A8", "S100A9", "CD14", "FCGR3A"],
20
+ "Macrophages": ["CD68", "CD163", "MRC1", "MSR1", "MARCO"],
21
+ "Dendritic cells": ["FCER1A", "CLEC10A", "CD1C", "ITGAX"],
22
+ "Plasmacytoid DCs": ["LILRA4", "IRF7", "TCF4", "CLEC4C"],
23
+ "Neutrophils": ["CSF3R", "FCGR3B", "CXCR2", "S100A12"],
24
+ "Mast cells": ["KIT", "TPSAB1", "TPSB2", "CPA3"],
25
+ "Erythrocytes": ["HBA1", "HBA2", "HBB", "GYPA"],
26
+ "Platelets": ["PPBP", "PF4", "GP9", "ITGA2B"],
27
+ "Fibroblasts": ["DCN", "COL1A1", "COL1A2", "LUM", "PDGFRA"],
28
+ "Endothelial": ["PECAM1", "VWF", "CDH5", "ERG", "FLT1"],
29
+ "Epithelial": ["EPCAM", "KRT18", "KRT19", "CDH1"],
30
+ }
31
+
32
+
33
+ def _check_scanpy():
34
+ """Check if scanpy is installed and return it, or None."""
35
+ try:
36
+ import scanpy as sc
37
+ return sc
38
+ except ImportError:
39
+ return None
40
+
41
+
42
@registry.register(
    name="singlecell.cluster",
    description="Cluster single-cell RNA-seq data using Leiden/Louvain community detection with PCA and UMAP embedding",
    category="singlecell",
    parameters={
        "data_path": "Path to h5ad or CSV file with single-cell expression data",
        "resolution": "Clustering resolution (higher = more clusters, default 1.0)",
        "method": "Clustering method: 'leiden' or 'louvain' (default 'leiden')",
    },
    usage_guide="You have single-cell RNA-seq data and need to identify cell populations. Run this first in any single-cell analysis workflow. Produces cluster assignments and UMAP coordinates for downstream annotation.",
)
def cluster(data_path: str, resolution: float = 1.0, method: str = "leiden", **kwargs) -> dict:
    """Cluster single-cell data: load -> normalize -> PCA -> neighbors -> clustering -> UMAP.

    Supports h5ad (AnnData) and CSV input formats (CSV is read as cells x genes
    with the first column as the index). Returns cluster assignments, top
    marker genes per cluster, and a UMAP coordinate summary.

    Parameters
    ----------
    data_path : path to a .h5ad or .csv expression matrix.
    resolution : community-detection resolution; higher yields more clusters.
    method : 'leiden' (default); any other value selects 'louvain'. The
        returned "method" field always reports the algorithm that actually ran
        (leiden silently falls back to louvain when its backend is missing).

    Returns a dict with "summary", cluster sizes, per-cluster marker genes and
    UMAP bounds — or a dict with "error"/"summary" keys on any failure.
    """
    sc = _check_scanpy()
    if sc is None:
        return {
            "error": "scanpy is required for single-cell clustering. Install with: pip install scanpy",
            "summary": "scanpy not installed. Install with: pip install scanpy",
        }

    import numpy as np

    # Load data
    try:
        if data_path.endswith(".h5ad"):
            adata = sc.read_h5ad(data_path)
        elif data_path.endswith(".csv"):
            import pandas as pd
            df = pd.read_csv(data_path, index_col=0)
            from anndata import AnnData
            adata = AnnData(df)
        else:
            return {
                "error": f"Unsupported file format: {data_path}. Use .h5ad or .csv",
                "summary": f"Cannot read {data_path} — expected .h5ad or .csv format",
            }
    except Exception as e:
        return {
            "error": f"Failed to load data: {e}",
            "summary": f"Could not read single-cell data from {data_path}",
        }

    n_cells, n_genes = adata.shape

    # Normalize the requested method up front so the result dict always
    # reports the algorithm that actually ran. (Previously an unrecognized
    # method string was echoed back verbatim even though louvain was used.)
    if method != "leiden":
        method = "louvain"

    # Store raw counts for marker gene detection
    adata.layers["counts"] = adata.X.copy()

    # Standard preprocessing pipeline: library-size normalization + log1p
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)

    # Restrict PCA to highly variable genes only when the panel is large
    # enough for HVG selection to be meaningful.
    if n_genes > 2000:
        sc.pp.highly_variable_genes(adata, n_top_genes=min(2000, n_genes))
        adata_hvg = adata[:, adata.var["highly_variable"]].copy()
    else:
        adata_hvg = adata.copy()

    # Scale and PCA; cap components at the data's rank limits
    sc.pp.scale(adata_hvg, max_value=10)
    n_pcs = min(50, adata_hvg.shape[1] - 1, adata_hvg.shape[0] - 1)
    sc.tl.pca(adata_hvg, n_comps=n_pcs)

    # Transfer PCA back to the full AnnData so downstream steps see all genes
    adata.obsm["X_pca"] = adata_hvg.obsm["X_pca"]

    # Neighbors graph (k capped for tiny datasets) then community detection
    n_neighbors = min(15, n_cells - 1)
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pcs)

    if method == "leiden":
        try:
            sc.tl.leiden(adata, resolution=resolution, key_added="cluster")
        except Exception:
            # Fall back to louvain if leiden (or its igraph backend) is unavailable
            sc.tl.louvain(adata, resolution=resolution, key_added="cluster")
            method = "louvain"
    else:
        sc.tl.louvain(adata, resolution=resolution, key_added="cluster")

    # UMAP embedding for visualization coordinates
    sc.tl.umap(adata)

    # Cluster assignments as plain strings for JSON-friendly output
    clusters = adata.obs["cluster"].astype(str)
    n_clusters = clusters.nunique()
    cluster_sizes = clusters.value_counts().to_dict()

    # Rank marker genes per cluster on the raw counts layer; best-effort —
    # marker detection failure should not fail the whole clustering call.
    try:
        sc.tl.rank_genes_groups(adata, groupby="cluster", method="wilcoxon", layer="counts")
        marker_genes = {}
        for cl in sorted(clusters.unique(), key=lambda x: int(x) if x.isdigit() else x):
            names = adata.uns["rank_genes_groups"]["names"][cl][:5]
            scores = adata.uns["rank_genes_groups"]["scores"][cl][:5]
            marker_genes[str(cl)] = [
                {"gene": str(n), "score": round(float(s), 3)}
                for n, s in zip(names, scores)
            ]
    except Exception:
        marker_genes = {}

    # UMAP summary statistics (bounding box of the embedding)
    umap_coords = adata.obsm["X_umap"]
    umap_summary = {
        "min_x": round(float(np.min(umap_coords[:, 0])), 3),
        "max_x": round(float(np.max(umap_coords[:, 0])), 3),
        "min_y": round(float(np.min(umap_coords[:, 1])), 3),
        "max_y": round(float(np.max(umap_coords[:, 1])), 3),
    }

    # Build summary text: top 3 markers for up to 5 clusters
    marker_str_parts = []
    for cl in sorted(marker_genes.keys(), key=lambda x: int(x) if x.isdigit() else x)[:5]:
        genes = ", ".join(m["gene"] for m in marker_genes[cl][:3])
        marker_str_parts.append(f"cluster {cl}: {genes}")
    marker_summary = "; ".join(marker_str_parts) if marker_str_parts else "N/A"

    summary = (
        f"Clustered {n_cells} cells into {n_clusters} clusters "
        f"({method} r={resolution}). "
        f"Top markers: {marker_summary}"
    )

    return {
        "summary": summary,
        "n_cells": n_cells,
        "n_genes": n_genes,
        "n_clusters": n_clusters,
        "method": method,
        "resolution": resolution,
        "cluster_sizes": cluster_sizes,
        "marker_genes": marker_genes,
        "umap_summary": umap_summary,
    }
181
+
182
+
183
@registry.register(
    name="singlecell.trajectory",
    description="Infer developmental trajectories and pseudotime from single-cell data using diffusion maps and PAGA",
    category="singlecell",
    parameters={
        "data_path": "Path to h5ad file (ideally pre-clustered from singlecell.cluster)",
        "root_cluster": "Cluster to use as root for pseudotime (optional, auto-detected if not set)",
        "method": "Trajectory method: 'diffmap' (default) or 'paga'",
    },
    usage_guide="You have clustered single-cell data and want to understand differentiation trajectories, lineage relationships, or developmental ordering. Run after singlecell.cluster. Computes pseudotime and identifies branch points.",
)
def trajectory(data_path: str, root_cluster: str = None, method: str = "diffmap", **kwargs) -> dict:
    """Infer trajectories using diffusion map + PAGA.

    Computes diffusion pseudotime from a root cell (selected from root_cluster
    or auto-detected as the cluster with lowest diffusion component 1).
    PAGA provides a coarse-grained graph of cluster connectivity.

    Parameters
    ----------
    data_path : path to an .h5ad file; other formats return an error dict.
    root_cluster : cluster label to anchor pseudotime at; auto-detected from
        the lowest diffusion component 1 value when omitted.
    method : echoed in the result dict; both diffusion map and PAGA are
        always computed regardless of its value.

    Returns a dict with pseudotime statistics per cluster, a lineage ordering,
    PAGA branch points, and connectivities — or an "error"/"summary" dict.
    """
    sc = _check_scanpy()
    if sc is None:
        return {
            "error": "scanpy is required for trajectory analysis. Install with: pip install scanpy",
            "summary": "scanpy not installed. Install with: pip install scanpy",
        }

    import numpy as np

    # Load data (h5ad only — trajectory needs the graph structures an h5ad carries)
    try:
        if data_path.endswith(".h5ad"):
            adata = sc.read_h5ad(data_path)
        else:
            return {
                "error": "Trajectory analysis requires h5ad format with pre-computed neighbors",
                "summary": "Use singlecell.cluster first to generate an h5ad file",
            }
    except Exception as e:
        return {"error": f"Failed to load data: {e}", "summary": f"Could not read {data_path}"}

    n_cells = adata.shape[0]

    # Ensure neighbors are computed (recompute if the file lacks a graph)
    if "neighbors" not in adata.uns:
        n_neighbors = min(15, n_cells - 1)
        n_pcs = min(50, adata.shape[1] - 1, n_cells - 1)
        sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pcs)

    # Ensure clustering exists; accept any of the common cluster column names
    cluster_key = None
    for key in ["cluster", "leiden", "louvain"]:
        if key in adata.obs.columns:
            cluster_key = key
            break
    if cluster_key is None:
        return {
            "error": "No cluster assignments found. Run singlecell.cluster first.",
            "summary": "Pre-clustered data required for trajectory analysis",
        }

    # Compute diffusion map (15 components feeds both root detection and dpt)
    sc.tl.diffmap(adata, n_comps=15)

    # PAGA for coarse-grained cluster-to-cluster connectivity
    sc.tl.paga(adata, groups=cluster_key)
    paga_connectivities = adata.uns["paga"]["connectivities"].toarray()

    # Determine root cell
    clusters = adata.obs[cluster_key].astype(str)
    if root_cluster is not None:
        root_cluster = str(root_cluster)
        if root_cluster not in clusters.values:
            return {
                "error": f"Root cluster '{root_cluster}' not found. Available: {sorted(clusters.unique())}",
                "summary": f"Invalid root cluster: {root_cluster}",
            }
        # Select root as cell in root_cluster with lowest DC1
        mask = clusters == root_cluster
        dc1_values = adata.obsm["X_diffmap"][mask, 0]
        root_idx_in_cluster = np.argmin(dc1_values)
        root_idx = np.where(mask)[0][root_idx_in_cluster]
    else:
        # Auto-detect: cell with lowest DC1 value, anywhere in the data
        root_idx = int(np.argmin(adata.obsm["X_diffmap"][:, 0]))
        root_cluster = str(clusters.iloc[root_idx])

    # scanpy's dpt reads its root cell index from adata.uns["iroot"]
    adata.uns["iroot"] = root_idx

    # Compute diffusion pseudotime
    sc.tl.dpt(adata)
    pseudotime = adata.obs["dpt_pseudotime"].values

    # Identify branches (clusters connected in PAGA)
    # NOTE(review): rows/columns of the PAGA connectivity matrix follow the
    # categorical ordering of adata.obs[cluster_key], while cluster_names here
    # is sorted numerically-then-lexically. These agree for default leiden/
    # louvain labels ('0', '1', ...) but should be confirmed for custom labels.
    cluster_names = sorted(clusters.unique(), key=lambda x: int(x) if x.isdigit() else x)
    n_clusters = len(cluster_names)

    # Find branch points: clusters connected to 3+ other clusters in PAGA
    branch_points = []
    paga_threshold = 0.1  # minimum connectivity to count as a real edge
    for i, cl in enumerate(cluster_names):
        n_connections = np.sum(paga_connectivities[i] > paga_threshold)
        if n_connections >= 3:
            branch_points.append({
                "cluster": cl,
                "n_connections": int(n_connections),
                "connected_to": [
                    cluster_names[j]
                    for j in range(n_clusters)
                    if paga_connectivities[i, j] > paga_threshold and i != j
                ],
            })

    # Pseudotime statistics per cluster (non-finite dpt values are skipped;
    # dpt can emit inf for cells unreachable from the root)
    pseudotime_stats = {}
    for cl in cluster_names:
        mask = clusters == cl
        pt_values = pseudotime[mask]
        valid = pt_values[np.isfinite(pt_values)]
        if len(valid) > 0:
            pseudotime_stats[cl] = {
                "mean": round(float(np.mean(valid)), 4),
                "median": round(float(np.median(valid)), 4),
                "min": round(float(np.min(valid)), 4),
                "max": round(float(np.max(valid)), 4),
            }

    # Lineage ordering: sort clusters by mean pseudotime
    lineage_order = sorted(
        pseudotime_stats.keys(),
        key=lambda x: pseudotime_stats[x]["mean"],
    )

    # NOTE(review): if every pseudotime value were non-finite, valid_pt would
    # be empty and np.min/np.max would raise — confirm whether dpt can produce
    # that case for degenerate inputs.
    valid_pt = pseudotime[np.isfinite(pseudotime)]
    pt_range = (round(float(np.min(valid_pt)), 4), round(float(np.max(valid_pt)), 4))

    summary = (
        f"Trajectory analysis: {len(branch_points)} branch point(s) from root "
        f"(cluster {root_cluster}), pseudotime range {pt_range[0]}-{pt_range[1]}"
    )

    return {
        "summary": summary,
        "n_cells": n_cells,
        "root_cluster": root_cluster,
        "root_cell_index": int(root_idx),
        "method": method,
        "pseudotime_range": pt_range,
        "pseudotime_per_cluster": pseudotime_stats,
        "lineage_order": lineage_order,
        "branch_points": branch_points,
        "n_branches": len(branch_points),
        "paga_connectivities": {
            cluster_names[i]: {
                cluster_names[j]: round(float(paga_connectivities[i, j]), 4)
                for j in range(n_clusters)
                if paga_connectivities[i, j] > paga_threshold and i != j
            }
            for i in range(n_clusters)
        },
    }
342
+
343
+
344
@registry.register(
    name="singlecell.cell_type_annotate",
    description="Annotate cell clusters with cell type labels using marker gene panels or CellTypist",
    category="singlecell",
    parameters={
        "data_path": "Path to h5ad file (should be clustered, e.g. from singlecell.cluster)",
        "reference": "Reference panel: 'immune', 'pbmc', 'tissue', or 'all' (default 'immune')",
        "method": "Annotation method: 'marker_based' (default) or 'celltypist'",
    },
    usage_guide="You have clustered single-cell data and need to assign cell type identities. Run after singlecell.cluster. Uses canonical marker genes to score each cluster against known cell type signatures.",
)
def cell_type_annotate(data_path: str, reference: str = "immune", method: str = "marker_based", **kwargs) -> dict:
    """Annotate clusters with cell type labels.

    marker_based: score each cluster's mean expression against the canonical
    MARKER_PANELS, weighting each panel's mean marker expression by the
    fraction of its genes present in the data.
    celltypist: CellTypist automated annotation (requires the celltypist
    package; downloads its reference models on first use).

    Parameters
    ----------
    data_path : path to a clustered .h5ad (or raw .csv) expression matrix.
    reference : 'immune'/'pbmc' drop the stromal panels; 'tissue', 'all', or
        any unrecognized value use the full panel set.
    method : 'marker_based' (default) or 'celltypist'.

    Returns a dict with per-cluster annotations and a cell type distribution,
    or an "error"/"summary" dict on failure.
    """
    # Fail fast if the optional celltypist dependency is missing
    if method == "celltypist":
        try:
            import celltypist  # noqa: F401 — availability check only
        except ImportError:
            return {
                "error": "celltypist is required for automated annotation. Install with: pip install celltypist",
                "summary": "celltypist not installed. Use method='marker_based' or install with: pip install celltypist",
            }

    sc = _check_scanpy()
    if sc is None:
        return {
            "error": "scanpy is required for cell type annotation. Install with: pip install scanpy",
            "summary": "scanpy not installed. Install with: pip install scanpy",
        }

    import numpy as np

    # Load data
    try:
        if data_path.endswith(".h5ad"):
            adata = sc.read_h5ad(data_path)
        elif data_path.endswith(".csv"):
            import pandas as pd
            df = pd.read_csv(data_path, index_col=0)
            from anndata import AnnData
            adata = AnnData(df)
        else:
            return {
                "error": f"Unsupported file format: {data_path}",
                "summary": f"Cannot read {data_path} — expected .h5ad or .csv",
            }
    except Exception as e:
        return {"error": f"Failed to load data: {e}", "summary": f"Could not read {data_path}"}

    # Find cluster key (accept the common column names used elsewhere in this module)
    cluster_key = None
    for key in ["cluster", "leiden", "louvain", "cell_type"]:
        if key in adata.obs.columns:
            cluster_key = key
            break

    if cluster_key is None:
        return {
            "error": "No cluster assignments found. Run singlecell.cluster first.",
            "summary": "Pre-clustered data required for annotation",
        }

    clusters = adata.obs[cluster_key].astype(str)
    cluster_names = sorted(clusters.unique(), key=lambda x: int(x) if x.isdigit() else x)
    n_clusters = len(cluster_names)
    n_cells = adata.shape[0]

    # Select marker panels based on reference; unknown values fall back to the
    # full panel set (same behavior as 'tissue'/'all').
    if reference in ("immune", "pbmc"):
        panels = {k: v for k, v in MARKER_PANELS.items()
                  if k not in ("Fibroblasts", "Endothelial", "Epithelial")}
    else:
        panels = MARKER_PANELS.copy()

    if method == "celltypist":
        # CellTypist annotation path
        try:
            import celltypist
            from celltypist import models as ct_models

            ct_models.download_models(force_update=False)
            model = ct_models.Model.load(model="Immune_All_Low.pkl")
            predictions = celltypist.annotate(adata, model=model, majority_voting=True)
            adata_result = predictions.to_adata()

            annotations = {}
            for cl in cluster_names:
                mask = clusters == cl
                # Per-cluster label = majority vote of per-cell predictions
                cl_types = adata_result.obs.loc[mask, "majority_voting"].value_counts()
                top_type = cl_types.index[0] if len(cl_types) > 0 else "Unknown"
                confidence = float(cl_types.iloc[0] / cl_types.sum()) if len(cl_types) > 0 else 0.0
                annotations[cl] = {
                    "cell_type": top_type,
                    "confidence": round(confidence, 3),
                    "n_cells": int(mask.sum()),
                    "method": "celltypist",
                }

            annotation_list = list(annotations.values())
        except Exception as e:
            return {
                "error": f"CellTypist annotation failed: {e}",
                "summary": "CellTypist error — try method='marker_based' instead",
            }
    else:
        # Marker-based annotation
        gene_names = set(adata.var_names)
        # Hoisted out of the per-cluster loop: the gene -> column index map is
        # invariant across clusters (it was previously rebuilt every iteration).
        gene_to_idx = {g: i for i, g in enumerate(adata.var_names)}

        annotations = {}
        for cl in cluster_names:
            mask = clusters == cl
            n_cells_cl = int(mask.sum())

            # Mean expression per gene within this cluster (sparse or dense X)
            if hasattr(adata.X, "toarray"):
                cl_expr = np.array(adata.X[mask].toarray().mean(axis=0)).flatten()
            else:
                cl_expr = np.array(adata.X[mask].mean(axis=0)).flatten()

            # Score each cell type panel
            scores = {}
            for cell_type, markers in panels.items():
                present_markers = [m for m in markers if m in gene_names]
                if not present_markers:
                    continue
                marker_indices = [gene_to_idx[m] for m in present_markers]
                marker_expr = cl_expr[marker_indices]
                # Score = mean expression of present markers, weighted by fraction present
                score = float(np.mean(marker_expr)) * (len(present_markers) / len(markers))
                scores[cell_type] = round(score, 4)

            if scores:
                best_type = max(scores, key=scores.get)
                best_score = scores[best_type]
                # Confidence blends magnitude with specificity (best vs second-best),
                # with specificity capped at 5x so one dominant panel maxes out
                confidence = 0.0
                sorted_scores = sorted(scores.values(), reverse=True)
                if len(sorted_scores) > 1 and sorted_scores[1] > 0:
                    specificity = sorted_scores[0] / sorted_scores[1]
                else:
                    specificity = float("inf") if best_score > 0 else 0.0
                if best_score > 0:
                    confidence = min(1.0, best_score * min(specificity, 5.0) / 5.0)
            else:
                best_type = "Unknown"
                best_score = 0.0
                confidence = 0.0

            annotations[cl] = {
                "cell_type": best_type,
                "confidence": round(confidence, 3),
                "n_cells": n_cells_cl,
                "marker_score": round(best_score, 4),
                "all_scores": dict(sorted(scores.items(), key=lambda x: -x[1])[:5]),
                "method": "marker_based",
            }

        annotation_list = list(annotations.values())

    # Aggregate annotated cell counts per predicted type
    type_counts = {}
    for ann in annotation_list:
        label = ann["cell_type"]
        type_counts[label] = type_counts.get(label, 0) + ann["n_cells"]

    total_cells = sum(type_counts.values())
    # Guard against division by zero on degenerate (zero-cell) inputs
    if total_cells > 0:
        type_distribution = {
            label: f"{count / total_cells:.0%}" for label, count in
            sorted(type_counts.items(), key=lambda x: -x[1])
        }
    else:
        type_distribution = {}

    # Summary: top 5 predicted types with their cell fraction
    dist_str = ", ".join(f"{label} ({pct})" for label, pct in list(type_distribution.items())[:5])
    summary = f"Annotated {n_clusters} clusters: {dist_str}"

    return {
        "summary": summary,
        "n_cells": n_cells,
        "n_clusters": n_clusters,
        "method": method,
        "reference": reference,
        "annotations": annotations,
        "cell_type_distribution": type_distribution,
    }