spatialcore 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. spatialcore/__init__.py +122 -0
  2. spatialcore/annotation/__init__.py +253 -0
  3. spatialcore/annotation/acquisition.py +529 -0
  4. spatialcore/annotation/annotate.py +603 -0
  5. spatialcore/annotation/cellxgene.py +365 -0
  6. spatialcore/annotation/confidence.py +802 -0
  7. spatialcore/annotation/discovery.py +529 -0
  8. spatialcore/annotation/expression.py +363 -0
  9. spatialcore/annotation/loading.py +529 -0
  10. spatialcore/annotation/markers.py +297 -0
  11. spatialcore/annotation/ontology.py +1282 -0
  12. spatialcore/annotation/patterns.py +247 -0
  13. spatialcore/annotation/pipeline.py +620 -0
  14. spatialcore/annotation/synapse.py +380 -0
  15. spatialcore/annotation/training.py +1457 -0
  16. spatialcore/annotation/validation.py +422 -0
  17. spatialcore/core/__init__.py +34 -0
  18. spatialcore/core/cache.py +118 -0
  19. spatialcore/core/logging.py +135 -0
  20. spatialcore/core/metadata.py +149 -0
  21. spatialcore/core/utils.py +768 -0
  22. spatialcore/data/gene_mappings/ensembl_to_hugo_human.tsv +86372 -0
  23. spatialcore/data/markers/canonical_markers.json +83 -0
  24. spatialcore/data/ontology_mappings/ontology_index.json +63865 -0
  25. spatialcore/plotting/__init__.py +109 -0
  26. spatialcore/plotting/benchmark.py +477 -0
  27. spatialcore/plotting/celltype.py +329 -0
  28. spatialcore/plotting/confidence.py +413 -0
  29. spatialcore/plotting/spatial.py +505 -0
  30. spatialcore/plotting/utils.py +411 -0
  31. spatialcore/plotting/validation.py +1342 -0
  32. spatialcore-0.1.9.dist-info/METADATA +213 -0
  33. spatialcore-0.1.9.dist-info/RECORD +36 -0
  34. spatialcore-0.1.9.dist-info/WHEEL +5 -0
  35. spatialcore-0.1.9.dist-info/licenses/LICENSE +201 -0
  36. spatialcore-0.1.9.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1282 @@
1
+ """
2
+ Ontology mapping: Convert cell type labels to Cell Ontology (CL) codes.
3
+
4
+ This module provides a 4-tier matching system:
5
+ 1. Tier 0: Pattern canonicalization (known abbreviations → CL term names)
6
+ 2. Tier 1: Exact match (score 1.0)
7
+ 3. Tier 2: Token-based match (score 0.40-0.80)
8
+ 4. Tier 3: Word overlap fallback (score 0.5-0.9)
9
+
10
+ The system uses a pre-built JSON index for fast, offline operation.
11
+
12
+ References:
13
+ - Cell Ontology (CL): https://github.com/obophenotype/cell-ontology
14
+ - NCI Thesaurus (NCIT): https://ncithesaurus.nci.nih.gov/
15
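+ 
+ Typical usage (illustrative sketch; see add_ontology_ids() for details):
+ 
+ >>> from spatialcore.annotation import add_ontology_ids
+ >>> adata, mappings, result = add_ontology_ids(
+ ...     adata, source_col="cell_type"
+ ... )  # doctest: +SKIP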
+ """
16
+
17
+ from dataclasses import dataclass, field
18
+ from pathlib import Path
19
+ from typing import Dict, List, Optional, Tuple, Any, Union
20
+ from datetime import datetime
21
+ import re
22
+ import json
23
+
24
+ import numpy as np
25
+ import pandas as pd
26
+ import anndata as ad
27
+
28
+ from spatialcore.core.logging import get_logger
29
+ from spatialcore.annotation.patterns import CELL_TYPE_PATTERNS, get_canonical_term
30
+
31
+ logger = get_logger(__name__)
32
+
33
+
34
+ # ============================================================================
35
+ # Constants: Unknown Cell Type Definition
36
+ # ============================================================================
37
+
38
+ # For unmapped labels that don't match any ontology term, we use a
39
+ # standardized "Unknown" category. CL has no native "unknown cell" term.
40
+ UNKNOWN_CELL_TYPE_ID = "unknown"
41
+ UNKNOWN_CELL_TYPE_NAME = "Unknown"
42
+
43
+
44
+ # ============================================================================
45
+ # Data Classes for Structured Results
46
+ # ============================================================================
47
+
48
+ @dataclass
49
+ class OntologyMappingResult:
50
+ """
51
+ Complete result of an ontology mapping operation.
52
+
53
+ Contains the mapping table, metadata, and any errors encountered.
54
+ This is the primary output structure from create_mapping_table().
55
+
56
+ Attributes
57
+ ----------
58
+ table : pd.DataFrame
59
+ Mapping table with columns: input_label, ontology_name, ontology_id,
60
+ match_tier, score, n_cells, canonical_term
61
+ metadata : Dict[str, Any]
62
+ Full metadata dictionary for JSON serialization
63
+ errors : List[Dict[str, Any]]
64
+ List of errors/warnings encountered during mapping
65
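+ 
+ Examples
+ --------
+ Illustrative; instances are normally built by create_mapping_table().
+ 
+ >>> result = create_mapping_table(mappings, cell_counts)  # doctest: +SKIP
+ >>> result.to_csv("mapping_table.csv")  # doctest: +SKIP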
+ """
66
+
67
+ table: pd.DataFrame
68
+ metadata: Dict[str, Any] = field(default_factory=dict)
69
+ errors: List[Dict[str, Any]] = field(default_factory=list)
70
+
71
+ def to_json(self, path: Union[str, Path]) -> Path:
72
+ """
73
+ Save metadata to JSON file.
74
+
75
+ Parameters
76
+ ----------
77
+ path : str or Path
78
+ Output path for JSON file.
79
+
80
+ Returns
81
+ -------
82
+ Path
83
+ Path to saved file.
84
+ """
85
+ path = Path(path)
86
+ with open(path, "w", encoding="utf-8") as f:
87
+ json.dump(self.metadata, f, indent=2, default=str)
88
+ return path
89
+
90
+ def to_csv(self, path: Union[str, Path]) -> Path:
91
+ """
92
+ Save mapping table to CSV.
93
+
94
+ Parameters
95
+ ----------
96
+ path : str or Path
97
+ Output path for CSV file.
98
+
99
+ Returns
100
+ -------
101
+ Path
102
+ Path to saved file.
103
+ """
104
+ path = Path(path)
105
+ self.table.to_csv(path, index=False)
106
+ return path
107
+
108
+
109
+ # ============================================================================
110
+ # Index Loading
111
+ # ============================================================================
112
+
113
+ _ONTOLOGY_INDEX_CACHE: Optional[Dict] = None
114
+
115
+
116
+ def load_ontology_index(
117
+ index_path: Optional[Union[str, Path]] = None,
118
+ use_cache: bool = True,
119
+ ) -> Dict[str, Dict[str, Dict[str, str]]]:
120
+ """
121
+ Load pre-built ontology index from JSON file.
122
+
123
+ Parameters
124
+ ----------
125
+ index_path : str or Path, optional
126
+ Path to ontology_index.json. If None, uses default location.
127
+ use_cache : bool, default True
128
+ Cache the loaded index for faster subsequent calls.
129
+
130
+ Returns
131
+ -------
132
+ Dict[str, Dict[str, Dict[str, str]]]
133
+ Nested dictionary: {ontology: {label_lower: {id, name}}}
134
+ - ontology: "cl", "ncit", or "uberon"
135
+ - label_lower: lowercase term name
136
+ - id: ontology ID (e.g., "CL:0000624")
137
+ - name: canonical term name
138
+
139
+ Examples
140
+ --------
141
+ >>> from spatialcore.annotation import load_ontology_index
142
+ >>> index = load_ontology_index()
143
+ >>> index["cl"]["b cell"]
144
+ {'id': 'CL:0000236', 'name': 'B cell'}
145
+ """
146
+ global _ONTOLOGY_INDEX_CACHE
147
+
148
+ if use_cache and _ONTOLOGY_INDEX_CACHE is not None:
149
+ return _ONTOLOGY_INDEX_CACHE
150
+
151
+ if index_path is None:
152
+ # Default: look in package data directory first, then fallback locations
153
+ possible_paths = []
154
+
155
+ # Primary: Package data directory
156
+ package_data_path = Path(__file__).parent.parent / "data" / "ontology_mappings" / "ontology_index.json"
157
+ possible_paths.append(package_data_path)
158
+
159
+ # Fallback: User cache
160
+ possible_paths.append(
161
+ Path.home() / ".cache" / "spatialcore" / "ontology_index.json"
162
+ )
163
+
164
+ for path in possible_paths:
165
+ if path.exists():
166
+ index_path = path
167
+ break
168
+ else:
169
+ raise FileNotFoundError(
170
+ f"Ontology index not found. Searched: {[str(p) for p in possible_paths]}. "
171
+ "Ensure spatialcore is installed correctly with data files."
172
+ )
173
+ else:
174
+ index_path = Path(index_path)
175
+
176
+ logger.info(f"Loading ontology index from: {index_path}")
177
+
178
+ with open(index_path, "r", encoding="utf-8") as f:
179
+ raw_index = json.load(f)
180
+
181
+ # Extract term dictionaries (skip metadata)
182
+ index = {
183
+ "cl": raw_index.get("cl", {}),
184
+ "ncit": raw_index.get("ncit", {}),
185
+ "uberon": raw_index.get("uberon", {}),
186
+ }
187
+
188
+ # Log stats
189
+ if "metadata" in raw_index:
190
+ meta = raw_index["metadata"]
191
+ logger.info(
192
+ f" CL: {meta.get('cl_terms', len(index['cl'])):,} terms, "
193
+ f"NCIT: {meta.get('ncit_terms', len(index['ncit'])):,} terms, "
194
+ f"UBERON: {meta.get('uberon_terms', len(index['uberon'])):,} terms"
195
+ )
196
+
197
+ if use_cache:
198
+ _ONTOLOGY_INDEX_CACHE = index
199
+
200
+ return index
201
+
202
+
203
+ # ============================================================================
204
+ # Token Extraction
205
+ # ============================================================================
206
+
207
+ # Words that are too generic for meaningful matching
208
+ GENERIC_TERMS = {"cell", "cells", "type", "like"}
209
+
210
+ # CL IDs that are too generic to be valid match results
211
+ # These terms exist in the ontology but should never be returned as matches
212
+ # because they provide no useful information about cell type identity
213
+ BLACKLISTED_CL_IDS = {
214
+ "CL:0000000", # "cell" - root term, too generic
215
+ "CL:0000003", # "native cell" - too generic
216
+ "CL:0000255", # "eukaryotic cell" - too generic
217
+ }
218
+
219
+ # Modifiers that describe state, not identity
220
+ MODIFIER_TERMS = {
221
+ "positive", "negative", "high", "low", "like", "type",
222
+ "mature", "immature", "activated", "resting", "proliferating",
223
+ "pro", "pre", "post", "inflammatory", "naive", "memory",
224
+ "effector", "resident", "circulating",
225
+ }
226
+
227
+ # Short tokens that ARE meaningful for cell types
228
+ MEANINGFUL_SHORT_TOKENS = {
229
+ "b", "t", "nk", "dc", "ec", "ve", "ta",
230
+ "m1", "m2", "cd", "th", "ilc",
231
+ }
232
+
233
+
234
+ def extract_biological_tokens(label: str) -> Dict[str, List[str]]:
235
+ """
236
+ Extract key biological identifiers from a cell type label.
237
+
238
+ Parameters
239
+ ----------
240
+ label : str
241
+ Cell type label to tokenize.
242
+
243
+ Returns
244
+ -------
245
+ Dict[str, List[str]]
246
+ Dictionary with keys:
247
+ - markers: CD markers (cd4, cd8, cd19, ...)
248
+ - proteins: Immunoglobulins, gene names (igg, iga, spp1, ...)
249
+ - core_words: Main biological terms (helper, plasma, ...)
250
+ - modifiers: Descriptors (positive, mature, ...)
251
+
252
+ Examples
253
+ --------
254
+ >>> tokens = extract_biological_tokens("CD4+ T cells")
255
+ >>> tokens["markers"]
256
+ ['cd4']
257
+ >>> tokens["core_words"]
258
+ ['t']
259
+ """
260
+ label_lower = label.lower().strip()
261
+ tokens = {
262
+ "markers": [],
263
+ "proteins": [],
264
+ "core_words": [],
265
+ "modifiers": [],
266
+ }
267
+
268
+ # Extract CD markers: CD4, CD8, CD19, etc.
269
+ cd_markers = re.findall(r"cd\d+", label_lower)
270
+ tokens["markers"].extend(cd_markers)
271
+
272
+ # Extract immunoglobulin types: IgG, IgA, IgM, IgE, IgD
273
+ ig_types = re.findall(r"ig[gamed]", label_lower)
274
+ tokens["proteins"].extend(ig_types)
275
+
276
+ # Extract gene names (uppercase + plus sign pattern)
277
+ gene_markers = re.findall(r"\b[A-Z0-9]{3,}\+", label)
278
+ tokens["proteins"].extend([g.replace("+", "").lower() for g in gene_markers])
279
+
280
+ # Clean and extract core words
281
+ cleaned = re.sub(r"cd\d+", "", label_lower) # Remove CD markers
282
+ cleaned = re.sub(r"ig[gamed]", "", cleaned) # Remove Ig types
283
+ cleaned = re.sub(r"[+\-]", " ", cleaned) # Replace +/- with space
284
+ cleaned = re.sub(r"\d+", "", cleaned) # Remove numbers
285
+ cleaned = re.sub(r"\s+", " ", cleaned).strip()
286
+
287
+ for word in cleaned.split():
288
+ if word in MODIFIER_TERMS:
289
+ tokens["modifiers"].append(word)
290
+ elif word in GENERIC_TERMS:
291
+ pass # Skip generic terms
292
+ elif word in MEANINGFUL_SHORT_TOKENS:
293
+ tokens["core_words"].append(word)
294
+ elif len(word) > 1:
295
+ tokens["core_words"].append(word)
296
+
297
+ return tokens
298
+
299
+
300
+ # ============================================================================
301
+ # Scoring Functions
302
+ # ============================================================================
303
+
304
+ def _score_match(
305
+ search_label: str,
306
+ term_label: str,
307
+ tokens: Dict[str, List[str]],
308
+ is_pattern_match: bool,
309
+ ) -> float:
310
+ """
311
+ Calculate match score between search label and ontology term.
312
+
313
+ Parameters
314
+ ----------
315
+ search_label : str
316
+ Canonicalized search label.
317
+ term_label : str
318
+ Ontology term label (lowercase).
319
+ tokens : Dict
320
+ Extracted tokens from search label.
321
+ is_pattern_match : bool
322
+ Whether search_label came from pattern canonicalization.
323
+
324
+ Returns
325
+ -------
326
+ float
327
+ Match score (0.0 to 1.0).
328
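+ 
+ Examples
+ --------
+ Illustrative of the rules above (exact match vs. a single short token,
+ which takes the ambiguity penalty):
+ 
+ >>> tokens = extract_biological_tokens("B cells")
+ >>> _score_match("b cell", "b cell", tokens, is_pattern_match=False)
+ 1.0
+ >>> round(_score_match("b cells", "b cell", tokens, is_pattern_match=False), 2)
+ 0.55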
+ """
329
+ search_clean = search_label.lower().strip()
330
+ term_clean = term_label.lower().strip()
331
+
332
+ # Tier 1: Exact match
333
+ if search_clean == term_clean:
334
+ return 0.95 if is_pattern_match else 1.0
335
+
336
+ # Tier 1b: Clean match (remove symbols)
337
+ search_no_symbols = re.sub(r"[+\-,]", " ", search_clean)
338
+ search_no_symbols = re.sub(r"\s+", " ", search_no_symbols).strip()
339
+ term_no_symbols = re.sub(r"[+\-,]", " ", term_clean)
340
+ term_no_symbols = re.sub(r"\s+", " ", term_no_symbols).strip()
341
+
342
+ if search_no_symbols == term_no_symbols:
343
+ return 0.92 if is_pattern_match else 0.95
344
+
345
+ # Tier 1c: Word boundary contains match (avoid false positives)
346
+ if len(search_clean) >= 4:
347
+ # Only match if it's a word boundary match, not arbitrary substring
348
+ if re.search(rf'\b{re.escape(search_clean)}\b', term_clean):
349
+ return 0.88 if is_pattern_match else 0.90
350
+ elif re.search(rf'\b{re.escape(term_clean)}\b', search_clean):
351
+ return 0.86 if is_pattern_match else 0.88
352
+
353
+ # Tier 2: Token-based matching
354
+ term_words = set(term_clean.replace("-", " ").replace(",", " ").split())
355
+ core_words = tokens.get("core_words", [])
356
+ markers = tokens.get("markers", [])
357
+
358
+ if core_words:
359
+ # Check if all core words present (exact word match only)
360
+ matches_all_core = all(
361
+ any(word == tw for tw in term_words)
362
+ for word in core_words
363
+ )
364
+
365
+ if matches_all_core:
366
+ base_score = 0.70
367
+
368
+ # Penalty for single short token (too ambiguous)
369
+ if len(core_words) == 1 and len(core_words[0]) <= 2:
370
+ base_score -= 0.15
371
+
372
+ # Penalty for unwanted prefixes in term but not in label
373
+ unwanted_prefixes = ["pro", "pre", "post", "immature", "ecto", "endo"]
374
+ label_has_prefix = any(p in search_clean for p in unwanted_prefixes)
375
+ term_has_prefix = any(p in term_clean for p in unwanted_prefixes)
376
+ if term_has_prefix and not label_has_prefix:
377
+ base_score -= 0.15
378
+
379
+ # Bonus if markers also match
380
+ if markers and any(m in term_clean for m in markers):
381
+ base_score = max(base_score, 0.75)
382
+
383
+ # Bonus for multi-word labels (more specific, less ambiguous)
384
+ if len(core_words) >= 2:
385
+ base_score = min(base_score + 0.05, 0.85)
386
+
387
+ return max(base_score, 0.0)
388
+
389
+ # Tier 3: Word overlap (Jaccard similarity)
390
+ label_words = set(search_clean.replace("-", " ").replace(",", " ").split())
391
+ label_words -= GENERIC_TERMS
392
+
393
+ if label_words and term_words:
394
+ common = label_words & term_words
395
+ jaccard = len(common) / len(label_words | term_words)
396
+ score = 0.5 + (0.4 * jaccard) # Range: 0.5-0.9
397
+ return score
398
+
399
+ return 0.0
400
+
401
+
402
+ # ============================================================================
403
+ # Main Search Function
404
+ # ============================================================================
405
+
406
+ def search_ontology_index(
407
+ labels: List[str],
408
+ ontology_index: Optional[Dict] = None,
409
+ index_path: Optional[Union[str, Path]] = None,
410
+ annotation_type: str = "cell_type",
411
+ min_score: float = 0.7,
412
+ ) -> Dict[str, List[Dict[str, Any]]]:
413
+ """
414
+ Search ontology index for matching terms.
415
+
416
+ Uses a 4-tier matching system:
417
+ 1. Tier 0: Pattern canonicalization
418
+ 2. Tier 1: Exact/partial match
419
+ 3. Tier 2: Token-based match
420
+ 4. Tier 3: Word overlap fallback
421
+
422
+ Parameters
423
+ ----------
424
+ labels : List[str]
425
+ Cell type labels to search.
426
+ ontology_index : Dict, optional
427
+ Pre-loaded ontology index. If None, loads from file.
428
+ index_path : str or Path, optional
429
+ Path to ontology index JSON.
430
+ annotation_type : str, default "cell_type"
431
+ Type of annotation:
432
+ - "cell_type": Search CL only (most specific)
433
+ - "pathology": Search NCIT first, then CL
434
+ - "anatomy": Search UBERON first, then CL
435
+ - "all": Search CL, NCIT, UBERON
436
+ min_score : float, default 0.7
437
+ Minimum match score to accept.
438
+
439
+ Returns
440
+ -------
441
+ Dict[str, List[Dict[str, Any]]]
442
+ {label: [{id, name, ontology, score, match_type}, ...]}
443
+
444
+ Examples
445
+ --------
446
+ >>> from spatialcore.annotation import search_ontology_index
447
+ >>> results = search_ontology_index(["CD4+ T cells", "B cell", "NK cells"])
448
+ >>> results["CD4+ T cells"][0]
449
+ {'id': 'CL:0000624', 'name': 'CD4-positive, alpha-beta T cell', 'score': 0.95, ...}
450
+ """
451
+ if ontology_index is None:
452
+ ontology_index = load_ontology_index(index_path)
453
+
454
+ # Select ontologies based on annotation type
455
+ if annotation_type == "cell_type":
456
+ ontologies = ["cl"]
457
+ elif annotation_type == "pathology":
458
+ ontologies = ["ncit", "cl"]
459
+ elif annotation_type == "anatomy":
460
+ ontologies = ["uberon", "cl"]
461
+ else:
462
+ ontologies = ["cl", "ncit", "uberon"]
463
+
464
+ results = {label: [] for label in labels}
465
+
466
+ for label in labels:
467
+ label_lower = label.lower().strip().replace("_", " ")
468
+ label_normalized = label.replace("_", " ")
469
+
470
+ # Tier 0: Pattern canonicalization
471
+ canonical_term = get_canonical_term(label_normalized)
472
+ search_label = canonical_term if canonical_term else label_lower
473
+ is_pattern_match = canonical_term is not None
474
+
475
+ # Extract tokens for fuzzy matching
476
+ tokens = extract_biological_tokens(
477
+ canonical_term if canonical_term else label_normalized
478
+ )
479
+
480
+ # Search across ontologies
481
+ tier_results = []
482
+
483
+ for onto_prefix in ontologies:
484
+ onto_dict = ontology_index.get(onto_prefix.lower(), {})
485
+ ontology_matches = []
486
+
487
+ # Tier 1: Exact lookup
488
+ if search_label in onto_dict:
489
+ term = onto_dict[search_label]
490
+ ontology_matches.append({
491
+ "id": term["id"],
492
+ "name": term["name"],
493
+ "ontology": onto_prefix,
494
+ "score": 0.95 if is_pattern_match else 1.0,
495
+ "match_type": "tier0_pattern" if is_pattern_match else "tier1_exact",
496
+ })
497
+ else:
498
+ # Tier 2-3: Fuzzy matching
499
+ for term_label_lower, term_data in onto_dict.items():
500
+ # Skip imported terms (e.g., UBERON term in CL)
501
+ term_id_prefix = term_data["id"].split(":")[0].upper()
502
+ if term_id_prefix != onto_prefix.upper():
503
+ continue
504
+
505
+ # Skip obsolete terms
506
+ if "obsolete" in term_data["name"].lower():
507
+ continue
508
+
509
+ # Skip blacklisted generic terms (e.g., CL:0000000 "cell")
510
+ if term_data["id"] in BLACKLISTED_CL_IDS:
511
+ continue
512
+
513
+ score = _score_match(search_label, term_label_lower, tokens, is_pattern_match)
514
+
515
+ if score >= min_score:
516
+ ontology_matches.append({
517
+ "id": term_data["id"],
518
+ "name": term_data["name"],
519
+ "ontology": onto_prefix,
520
+ "score": score,
521
+ "match_type": "tier2_token" if score >= 0.7 else "tier3_overlap",
522
+ })
523
+
524
+ if ontology_matches:
525
+ tier_results.extend(ontology_matches)
526
+ # If good CL match found, don't search other ontologies
527
+ if onto_prefix == "cl" and any(m["score"] >= 0.8 for m in ontology_matches):
528
+ break
529
+
530
+ # Sort by score, deduplicate by ID
531
+ seen_ids = set()
532
+ unique_results = []
533
+ for result in sorted(tier_results, key=lambda x: x["score"], reverse=True):
534
+ if result["id"] not in seen_ids:
535
+ seen_ids.add(result["id"])
536
+ unique_results.append(result)
537
+
538
+ results[label] = unique_results
539
+
540
+ return results
541
+
542
+
543
+ # ============================================================================
544
+ # Mapping Result Saving
545
+ # ============================================================================
546
+
547
+ def _save_ontology_mapping_results(
548
+ save_dir: Path,
549
+ mappings: Dict[str, List[Dict]],
550
+ adata: ad.AnnData,
551
+ source_col: str,
552
+ dataset_name: str,
553
+ ) -> Tuple[Path, Optional[Path]]:
554
+ """
555
+ Save ontology mapping results for reproducibility.
556
+
557
+ Creates two files:
558
+ - {dataset}_{timestamp}_mapping.json: Full mapping results
559
+ - {dataset}_{timestamp}_missed.json: Unmapped terms (if any)
560
+
561
+ Parameters
562
+ ----------
563
+ save_dir : Path
564
+ Directory to save results.
565
+ mappings : Dict
566
+ Mapping results from search_ontology_index.
567
+ adata : AnnData
568
+ Source data for cell counts.
569
+ source_col : str
570
+ Source column name.
571
+ dataset_name : str
572
+ Name for output files.
573
+
574
+ Returns
575
+ -------
576
+ Tuple[Path, Optional[Path]]
577
+ Paths to mapping.json and missed.json (or None if no missed).
578
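+ 
+ Examples
+ --------
+ Illustrative sketch; ``adata`` and the column name here are placeholders.
+ 
+ >>> mappings = search_ontology_index(["B cell", "weird label"])
+ >>> _save_ontology_mapping_results(
+ ...     Path("./output"), mappings, adata, "cell_type", "demo_run"
+ ... )  # doctest: +SKIP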
+ """
579
+ save_dir = Path(save_dir)
580
+ save_dir.mkdir(parents=True, exist_ok=True)
581
+
582
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
583
+ base_name = f"{dataset_name}_{timestamp}"
584
+
585
+ try:
586
+ import spatialcore
587
+ spatialcore_version = spatialcore.__version__
588
+ except Exception:
589
+ spatialcore_version = "unknown"
590
+
591
+ # Count cells per label
592
+ cell_counts = adata.obs[source_col].value_counts().to_dict()
593
+
594
+ # Build mapping results
595
+ mapped_entries = []
596
+ missed_entries = []
597
+ tier_counts = {"pattern": 0, "exact": 0, "token": 0, "overlap": 0, "unmapped": 0}
598
+
599
+ for label, matches in mappings.items():
600
+ n_cells = int(cell_counts.get(label, 0))
601
+
602
+ if matches:
603
+ best = matches[0]
604
+ tier = best.get("match_type", "unknown")
605
+
606
+ # Simplify tier names for counting
607
+ if "pattern" in tier:
608
+ tier_counts["pattern"] += 1
609
+ elif "exact" in tier:
610
+ tier_counts["exact"] += 1
611
+ elif "token" in tier:
612
+ tier_counts["token"] += 1
613
+ else:
614
+ tier_counts["overlap"] += 1
615
+
616
+ canonical = get_canonical_term(label)
617
+ mapped_entries.append({
618
+ "input_label": label,
619
+ "canonical_term": canonical,
620
+ "ontology_id": best["id"],
621
+ "ontology_name": best["name"],
622
+ "match_tier": tier,
623
+ "score": round(best["score"], 3),
624
+ "n_cells": n_cells,
625
+ })
626
+ else:
627
+ tier_counts["unmapped"] += 1
628
+ # Get closest matches for missed terms
629
+ closest = search_ontology_index([label], min_score=0.4).get(label, [])[:3]
630
+ missed_entries.append({
631
+ "input_label": label,
632
+ "n_cells": n_cells,
633
+ "closest_matches": [
634
+ {"term": m["name"], "id": m["id"], "score": round(m["score"], 2)}
635
+ for m in closest
636
+ ],
637
+ "suggested_action": "manual_review",
638
+ })
639
+
640
+ # Save main mapping file
641
+ mapping_data = {
642
+ "dataset": dataset_name,
643
+ "created_at": datetime.now().isoformat(),
644
+ "spatialcore_version": spatialcore_version,
645
+ "source_column": source_col,
646
+ "summary": {
647
+ "total_labels": len(mappings),
648
+ "mapped": len(mapped_entries),
649
+ "unmapped": len(missed_entries),
650
+ "match_rate": round(len(mapped_entries) / len(mappings), 3) if mappings else 0,
651
+ },
652
+ "tier_breakdown": tier_counts,
653
+ "mappings": mapped_entries,
654
+ }
655
+
656
+ mapping_path = save_dir / f"{base_name}_mapping.json"
657
+ with open(mapping_path, "w") as f:
658
+ json.dump(mapping_data, f, indent=2)
659
+ logger.info(f"Saved mapping results to: {mapping_path}")
660
+
661
+ # Save missed terms file (if any)
662
+ missed_path = None
663
+ if missed_entries:
664
+ missed_data = {
665
+ "dataset": dataset_name,
666
+ "created_at": datetime.now().isoformat(),
667
+ "source_column": source_col,
668
+ "missed_terms": missed_entries,
669
+ "recommendations": [
670
+ f"Review '{m['input_label']}' ({m['n_cells']} cells) - may need manual mapping"
671
+ for m in missed_entries[:5]
672
+ ],
673
+ }
674
+ missed_path = save_dir / f"{base_name}_missed.json"
675
+ with open(missed_path, "w") as f:
676
+ json.dump(missed_data, f, indent=2)
677
+ logger.info(f"Saved missed terms to: {missed_path}")
678
+
679
+ return mapping_path, missed_path
680
+
681
+
682
+ # ============================================================================
683
+ # Mapping Table and Metadata Generation
684
+ # ============================================================================
685
+
686
+
687
+ def create_mapping_table(
688
+ mappings: Dict[str, List[Dict]],
689
+ cell_counts: Dict[str, int],
690
+ skipped_labels: Optional[List[str]] = None,
691
+ index_source: Optional[str] = None,
692
+ min_score: float = 0.7,
693
+ dataset_name: str = "ontology_mapping",
694
+ ) -> OntologyMappingResult:
695
+ """
696
+ Create a structured mapping table from search results.
697
+
698
+ Produces a DataFrame and metadata suitable for visualization and
699
+ JSON export. This is the primary interface for extracting mapping
700
+ results in a structured format.
701
+
702
+ Parameters
703
+ ----------
704
+ mappings : Dict[str, List[Dict]]
705
+ Search results from search_ontology_index().
706
+ cell_counts : Dict[str, int]
707
+ Number of cells per input label.
708
+ skipped_labels : List[str], optional
709
+ Labels that were skipped (e.g., "Unassigned").
710
+ index_source : str, optional
711
+ Path or description of the ontology index used.
712
+ min_score : float, default 0.7
713
+ Minimum score threshold used for matching.
714
+ dataset_name : str, default "ontology_mapping"
715
+ Name for the dataset (used in metadata).
716
+
717
+ Returns
718
+ -------
719
+ OntologyMappingResult
720
+ Contains:
721
+ - table: DataFrame with columns [input_label, ontology_name,
722
+ ontology_id, match_tier, score, n_cells, canonical_term]
723
+ - metadata: Full metadata dict for JSON serialization
724
+ - errors: List of mapping errors/warnings
725
+
726
+ Examples
727
+ --------
728
+ >>> from spatialcore.annotation import search_ontology_index, create_mapping_table
729
+ >>> labels = ["CD4+ T cells", "B cells", "Unknown_cluster"]
730
+ >>> mappings = search_ontology_index(labels)
731
+ >>> cell_counts = {"CD4+ T cells": 1000, "B cells": 500, "Unknown_cluster": 50}
732
+ >>> result = create_mapping_table(mappings, cell_counts)
733
+ >>> print(result.table)
734
+ >>> result.to_json("mapping_metadata.json")
735
+ """
736
+ try:
737
+ import spatialcore
738
+ spatialcore_version = spatialcore.__version__
739
+ except Exception:
740
+ spatialcore_version = "unknown"
741
+
742
+ skipped_labels = skipped_labels or []
743
+ errors = []
744
+
745
+ # Build rows for the table
746
+ rows = []
747
+ tier_counts = {
748
+ "tier0_pattern": 0,
749
+ "tier1_exact": 0,
750
+ "tier2_token": 0,
751
+ "tier3_overlap": 0,
752
+ "unmapped": 0,
753
+ "skipped": 0,
754
+ }
755
+
756
+ # Process mapped labels
757
+ for label, matches in mappings.items():
758
+ n_cells = cell_counts.get(label, 0)
759
+ canonical = get_canonical_term(label)
760
+
761
+ if matches:
762
+ best = matches[0]
763
+ tier = best.get("match_type", "unknown")
764
+ score = best.get("score", 0.0)
765
+
766
+ # Normalize tier name for counting
767
+ if "pattern" in tier:
768
+ tier_counts["tier0_pattern"] += 1
769
+ elif "exact" in tier:
770
+ tier_counts["tier1_exact"] += 1
771
+ elif "token" in tier:
772
+ tier_counts["tier2_token"] += 1
773
+ elif "overlap" in tier:
774
+ tier_counts["tier3_overlap"] += 1
775
+ else:
776
+ tier_counts["unmapped"] += 1
777
+
778
+ rows.append({
779
+ "input_label": label,
780
+ "ontology_name": best["name"],
781
+ "ontology_id": best["id"],
782
+ "match_tier": tier,
783
+ "score": round(score, 3),
784
+ "n_cells": n_cells,
785
+ "canonical_term": canonical,
786
+ })
787
+ else:
788
+ # Unmapped - use Unknown
789
+ tier_counts["unmapped"] += 1
790
+ rows.append({
791
+ "input_label": label,
792
+ "ontology_name": UNKNOWN_CELL_TYPE_NAME,
793
+ "ontology_id": UNKNOWN_CELL_TYPE_ID,
794
+ "match_tier": "unmapped",
795
+ "score": 0.0,
796
+ "n_cells": n_cells,
797
+ "canonical_term": canonical,
798
+ })
799
+
800
+ # Record as error for review
801
+ errors.append({
802
+ "type": "unmapped",
803
+ "label": label,
804
+ "n_cells": n_cells,
805
+ "message": f"No ontology match found for '{label}'",
806
+ })
807
+
808
+ # Process skipped labels (Unassigned, Unknown, etc.)
809
+ for label in skipped_labels:
810
+ n_cells = cell_counts.get(label, 0)
811
+ tier_counts["skipped"] += 1
812
+ rows.append({
813
+ "input_label": label,
814
+ "ontology_name": label, # Keep original
815
+ "ontology_id": "skipped",
816
+ "match_tier": "skipped",
817
+ "score": None,
818
+ "n_cells": n_cells,
819
+ "canonical_term": None,
820
+ })
821
+
822
+ # Create DataFrame
823
+ table = pd.DataFrame(rows)
824
+
825
+ # Sort by tier (best matches first), then by cell count within tier
826
+ if len(table) > 0:
827
+ # Define tier order (best to worst)
828
+ tier_order = {
829
+ "tier1_exact": 0,
830
+ "tier0_pattern": 1,
831
+ "tier2_token": 2,
832
+ "tier3_overlap": 3,
833
+ "unmapped": 4,
834
+ "skipped": 5,
835
+ }
836
+ table["_tier_order"] = table["match_tier"].map(tier_order).fillna(6)
837
+ table = table.sort_values(
838
+ ["_tier_order", "n_cells"],
839
+ ascending=[True, False]
840
+ ).drop(columns=["_tier_order"]).reset_index(drop=True)
841
+
842
+ # Calculate summary statistics
843
+ total_labels = len(mappings) + len(skipped_labels)
844
+ mapped_labels = sum(1 for r in rows if r["match_tier"] not in ["unmapped", "skipped"])
845
+ unmapped_labels = tier_counts["unmapped"]
846
+ skipped_count = tier_counts["skipped"]
847
+
848
+ total_cells = sum(cell_counts.values())
849
+ mapped_cells = sum(r["n_cells"] for r in rows if r["match_tier"] not in ["unmapped", "skipped"])
850
+ unmapped_cells = sum(r["n_cells"] for r in rows if r["match_tier"] == "unmapped")
851
+
852
+ # Build metadata
853
+ metadata = {
854
+ "dataset_name": dataset_name,
855
+ "created_at": datetime.now().isoformat(),
856
+ "spatialcore_version": spatialcore_version,
857
+ "index_source": str(index_source) if index_source else "package_default",
858
+ "min_score": min_score,
859
+ "summary": {
860
+ "total_labels": total_labels,
861
+ "mapped_labels": mapped_labels,
862
+ "unmapped_labels": unmapped_labels,
863
+ "skipped_labels": skipped_count,
864
+ "mapping_rate": round(mapped_labels / max(total_labels - skipped_count, 1), 3),
865
+ "total_cells": total_cells,
866
+ "mapped_cells": mapped_cells,
867
+ "unmapped_cells": unmapped_cells,
868
+ "cell_mapping_rate": round(mapped_cells / max(total_cells, 1), 3),
869
+ },
870
+ "tier_breakdown": tier_counts,
871
+ "mappings": rows,
872
+ "errors": errors,
873
+ }
874
+
875
+ return OntologyMappingResult(
876
+ table=table,
877
+ metadata=metadata,
878
+ errors=errors,
879
+ )
880
+
881
+
882
+ # ============================================================================
883
+ # AnnData Integration
884
+ # ============================================================================
885
+
886
+ # Labels that should not be mapped to ontology (placeholders, not cell types)
887
+ SKIP_LABELS = {
888
+ "Unassigned", "unassigned", "Unknown", "unknown", "NA", "N/A", "nan",
889
+ "Other", "other", "Doublet", "doublet", "Doublets", "doublets",
890
+ "Low quality", "low quality", "Filtered", "filtered",
891
+ }
892
+
893
+
894
+ def has_ontology_ids(
895
+ adata: ad.AnnData,
896
+ id_col: str = "cell_type_ontology_term_id",
897
+ label_col: str = "cell_type",
898
+ ) -> Dict[str, Any]:
899
+ """
900
+ Check if AnnData has existing ontology IDs and their coverage.
901
+
902
+ Use this to decide whether label harmonization is needed before
903
+ subsample_balanced() and training.
904
+
905
+ Parameters
906
+ ----------
907
+ adata : AnnData
908
+ Data to check (typically after combine_references()).
909
+ id_col : str, default "cell_type_ontology_term_id"
910
+ Column that may contain existing CL IDs.
+ label_col : str, default "cell_type"
+ Present in the signature but not currently used by this check.
911
+
912
+ Returns
913
+ -------
914
+ Dict[str, Any]
915
+ Dictionary with keys:
916
+ - has_column: bool - whether id_col exists in adata.obs
917
+ - coverage: float - fraction of cells with valid CL IDs (0.0-1.0)
918
+ - n_with_ids: int - count of cells with valid CL IDs
919
+ - n_without_ids: int - count of cells without valid CL IDs
920
+ - unique_ids: List[str] - unique CL IDs found
921
+ - by_source: Dict[str, float] - coverage by reference_source (if present)
922
+
923
+ Examples
924
+ --------
925
+ >>> from spatialcore.annotation import has_ontology_ids
926
+ >>> status = has_ontology_ids(combined)
927
+ >>> print(f"Coverage: {status['coverage']:.1%}")
928
+ Coverage: 65.0%
929
+
930
+ >>> # Check per-source coverage
931
+ >>> status['by_source']
932
+ {'cellxgene_lung': 1.0, 'inhouse_batch1': 0.0}
933
+
934
+ >>> # Decision logic
935
+ >>> if status['coverage'] < 1.0:
936
+ ... print("Harmonization recommended")
937
+ """
938
+ result = {
939
+ "has_column": False,
940
+ "coverage": 0.0,
941
+ "n_with_ids": 0,
942
+ "n_without_ids": adata.n_obs,
943
+ "unique_ids": [],
944
+ "by_source": {},
945
+ }
946
+
947
+ if id_col not in adata.obs.columns:
948
+ logger.info(f"Column '{id_col}' not found in adata.obs")
949
+ return result
950
+
951
+ result["has_column"] = True
952
+
953
+ # Check for valid CL IDs (not null and starts with "CL:")
954
+ ids = adata.obs[id_col]
955
+ valid_mask = ids.notna() & ids.astype(str).str.startswith("CL:")
956
+
957
+ result["n_with_ids"] = int(valid_mask.sum())
958
+ result["n_without_ids"] = int((~valid_mask).sum())
959
+ result["coverage"] = result["n_with_ids"] / adata.n_obs if adata.n_obs > 0 else 0.0
960
+ result["unique_ids"] = ids[valid_mask].unique().tolist()
961
+
962
+ # Calculate per-source coverage if reference_source exists
963
+ if "reference_source" in adata.obs.columns:
964
+ by_source = {}
965
+ for source in adata.obs["reference_source"].unique():
966
+ source_mask = adata.obs["reference_source"] == source
967
+ source_valid = valid_mask & source_mask
968
+ source_total = source_mask.sum()
969
+ by_source[source] = source_valid.sum() / source_total if source_total > 0 else 0.0
970
+ result["by_source"] = by_source
971
+
972
+ logger.info(
973
+ f"Ontology ID coverage: {result['coverage']:.1%} "
974
+ f"({result['n_with_ids']:,}/{adata.n_obs:,} cells)"
975
+ )
976
+
977
+ return result
978
+
979
+
980
+ def add_ontology_ids(
981
+ adata: ad.AnnData,
982
+ source_col: str,
983
+ target_col: str = "cell_type_ontology_term_id",
984
+ name_col: Optional[str] = "cell_type_ontology_label",
985
+ min_score: float = 0.7,
986
+ index_path: Optional[Union[str, Path]] = None,
987
+ save_mapping: Optional[Union[str, Path]] = None,
988
+ dataset_name: Optional[str] = None,
989
+ skip_labels: Optional[set] = None,
990
+ skip_if_exists: bool = True,
991
+ copy: bool = False,
992
+ ) -> Tuple[ad.AnnData, Dict[str, List[Dict]], Optional[OntologyMappingResult]]:
993
+ """
994
+ Add ontology IDs to AnnData based on cell type labels.
995
+
996
+ Uses CellxGene standard column naming conventions by default.
997
+
998
+ Parameters
999
+ ----------
1000
+ adata : AnnData
1001
+ AnnData object with cell type labels.
1002
+ source_col : str
1003
+ Column in adata.obs containing cell type labels.
1004
+ target_col : str, default "cell_type_ontology_term_id"
1005
+ Column to store ontology IDs (CellxGene standard).
1006
+ name_col : str, optional, default "cell_type_ontology_label"
1007
+ Column to store canonical ontology names. If None, skip.
1008
+ min_score : float, default 0.7
1009
+ Minimum match score.
1010
+ index_path : str or Path, optional
1011
+ Path to ontology index JSON file. If None, uses package default.
1012
+ save_mapping : str or Path, optional
1013
+ Directory to save mapping results. Creates:
1014
+ - {dataset}_ontology_mapping.csv: Mapping table
1015
+ - {dataset}_ontology_mapping_metadata.json: Full metadata
1016
+ dataset_name : str, optional
1017
+ Name for output files. If None, uses 'ontology_mapping'.
1018
+ skip_labels : set, optional
1019
+ Labels to skip (mark as unmapped). If None, uses SKIP_LABELS default
1020
+ which includes "Unassigned", "Unknown", "Doublet", etc.
1021
+ skip_if_exists : bool, default True
1022
+ If True and target_col already exists with valid CL IDs, preserve
1023
+ existing IDs and only map cells with missing/invalid IDs. This is
1024
+ useful when combining CellxGene data (which has native CL IDs) with
1025
+ other references that need mapping.
1026
+ copy : bool, default False
1027
+ If True, return a copy.
1028
+
1029
+ Returns
1030
+ -------
1031
+ Tuple[AnnData, Dict, Optional[OntologyMappingResult]]
1032
+ - Updated AnnData with new columns (including _tier and _score)
1033
+ - Mapping dictionary with all matches per label
1034
+ - OntologyMappingResult with table and metadata (if save_mapping provided)
1035
+
1036
+ Notes
1037
+ -----
1038
+ The function adds the following columns to adata.obs:
1039
+ - {target_col}: Ontology ID (e.g., "CL:0000624") or "unknown"/"skipped"
1040
+ - {name_col}: Canonical ontology name (if name_col is not None)
1041
+ - {target_col}_tier: Match tier (tier0_pattern, tier1_exact, etc.)
1042
+ - {target_col}_score: Match score (0.0-1.0)
1043
+
1044
+ Examples
1045
+ --------
1046
+ >>> from spatialcore.annotation import add_ontology_ids
1047
+ >>> adata, mappings, result = add_ontology_ids(
1048
+ ... adata,
1049
+ ... source_col="celltypist",
1050
+ ... save_mapping="./output/",
1051
+ ... )
1052
+ >>> # View the mapping table
1053
+ >>> print(result.table)
1054
+ >>> # Save metadata JSON
1055
+ >>> result.to_json("ontology_mapping_metadata.json")
1056
+ """
1057
+ if copy:
1058
+ adata = adata.copy()
1059
+
1060
+ if source_col not in adata.obs.columns:
1061
+ raise ValueError(f"Source column '{source_col}' not found in adata.obs")
1062
+
1063
+ # Use default skip labels if not provided
1064
+ labels_to_skip = skip_labels if skip_labels is not None else SKIP_LABELS
1065
+
1066
+ # Check for existing valid CL IDs
1067
+ labels_with_existing_ids = set()
1068
+ existing_id_map = {} # label -> existing CL ID
1069
+ existing_name_map = {} # label -> existing ontology name
1070
+
1071
+ if skip_if_exists and target_col in adata.obs.columns:
1072
+ # Load ontology index for name lookups
1073
+ ontology_index = load_ontology_index(index_path)
1074
+ cl_index = ontology_index.get("cl", {})
1075
+
1076
+ # Build ID -> name lookup
1077
+ id_to_name = {}
1078
+ for term_lower, term_data in cl_index.items():
1079
+ id_to_name[term_data["id"]] = term_data["name"]
1080
+
1081
+ # Check which labels already have valid CL IDs
1082
+ for label in adata.obs[source_col].dropna().unique():
1083
+ label_mask = adata.obs[source_col] == label
1084
+ existing_ids = adata.obs.loc[label_mask, target_col].dropna()
1085
+
1086
+ # Find valid CL IDs
1087
+ valid_ids = [
1088
+ eid for eid in existing_ids.unique()
1089
+ if isinstance(eid, str) and eid.startswith("CL:")
1090
+ ]
1091
+
1092
+ if valid_ids:
1093
+ # Use the most common valid ID
1094
+ id_counts = existing_ids.value_counts()
1095
+ for top_id in id_counts.index:
1096
+ if isinstance(top_id, str) and top_id.startswith("CL:"):
1097
+ existing_id_map[label] = top_id
1098
+ existing_name_map[label] = id_to_name.get(top_id, str(label))
1099
+ labels_with_existing_ids.add(label)
1100
+ break
1101
+
1102
+ if labels_with_existing_ids:
1103
+ logger.info(
1104
+ f"Preserving existing CL IDs for {len(labels_with_existing_ids)} labels "
1105
+ f"(skip_if_exists=True)"
1106
+ )
1107
+
1108
+ # Get unique labels
1109
+ unique_labels = adata.obs[source_col].dropna().unique().tolist()
1110
+
1111
+ # Filter out skip labels and labels with existing IDs before searching
1112
+ labels_to_map = [
1113
+ l for l in unique_labels
1114
+ if l not in labels_to_skip and l not in labels_with_existing_ids
1115
+ ]
1116
+ skipped = [l for l in unique_labels if l in labels_to_skip]
1117
+
1118
+ if skipped:
1119
+ logger.info(f"Skipping {len(skipped)} non-cell-type labels: {skipped}")
1120
+
1121
+ logger.info(f"Mapping {len(labels_to_map)} unique cell types to ontology...")
1122
+
1123
+ # Search for matches (only labels_to_map, not skipped ones)
1124
+ mappings = search_ontology_index(
1125
+ labels_to_map,
1126
+ index_path=index_path,
1127
+ annotation_type="cell_type", # CL-only
1128
+ min_score=min_score,
1129
+ )
1130
+
1131
+ # Create label → ID, name, tier, and score mappings
1132
+ label_to_id = {}
1133
+ label_to_name = {}
1134
+ label_to_tier = {}
1135
+ label_to_score = {}
1136
+
1137
+ n_matched = 0
1138
+ n_unmatched = 0
1139
+ n_preserved = 0
1140
+
1141
+ # Handle labels with existing IDs (preserved from input data)
1142
+ for label in labels_with_existing_ids:
1143
+ label_to_id[label] = existing_id_map[label]
1144
+ label_to_name[label] = existing_name_map[label]
1145
+ label_to_tier[label] = "existing"
1146
+ label_to_score[label] = 1.0 # Full confidence for existing IDs
1147
+ n_preserved += 1
1148
+
1149
+ # Handle skipped labels (mark as skipped)
1150
+ for label in skipped:
1151
+ label_to_id[label] = "skipped"
1152
+ label_to_name[label] = label # Keep original
1153
+ label_to_tier[label] = "skipped"
1154
+ label_to_score[label] = None
1155
+
1156
+ # Process search results
1157
+ for label, matches in mappings.items():
1158
+ if matches:
1159
+ best_match = matches[0]
1160
+ label_to_id[label] = best_match["id"]
1161
+ label_to_name[label] = best_match["name"]
1162
+ label_to_tier[label] = best_match.get("match_type", "unknown")
1163
+ label_to_score[label] = round(best_match.get("score", 0.0), 3)
1164
+ n_matched += 1
1165
+ else:
1166
+ label_to_id[label] = UNKNOWN_CELL_TYPE_ID
1167
+ label_to_name[label] = UNKNOWN_CELL_TYPE_NAME
1168
+ label_to_tier[label] = "unmapped"
1169
+ label_to_score[label] = 0.0
1170
+ n_unmatched += 1
1171
+
1172
+ total_to_map = len(labels_to_map)
1173
+ if total_to_map > 0:
1174
+ logger.info(f" Matched: {n_matched}/{total_to_map} ({100*n_matched/total_to_map:.1f}%)")
1175
+ else:
1176
+ logger.info(" No labels to map")
1177
+ if n_preserved > 0:
1178
+ logger.info(f" Preserved (existing CL IDs): {n_preserved}")
1179
+ if n_unmatched > 0:
1180
+ unmatched = [l for l, m in mappings.items() if not m]
1181
+ logger.warning(f" Unmatched labels: {unmatched[:5]}{'...' if len(unmatched) > 5 else ''}")
1182
+
1183
+ # Apply mappings to adata
1184
+ adata.obs[target_col] = adata.obs[source_col].map(label_to_id)
1185
+ if name_col:
1186
+ adata.obs[name_col] = adata.obs[source_col].map(label_to_name)
1187
+
1188
+ # Store tier and score information (derive column names from target_col)
1189
+ # Handle both old format (cell_type_ontology_id) and new CellxGene format (cell_type_ontology_term_id)
1190
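+ # e.g. "cell_type_ontology_term_id" -> "cell_type_ontology_tier" and "cell_type_ontology_score"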
+ if "_term_id" in target_col:
1191
+ tier_col = target_col.replace("_term_id", "_tier")
1192
+ score_col = target_col.replace("_term_id", "_score")
1193
+ else:
1194
+ tier_col = target_col.replace("_id", "_tier")
1195
+ score_col = target_col.replace("_id", "_score")
1196
+ adata.obs[tier_col] = adata.obs[source_col].map(label_to_tier)
1197
+ adata.obs[score_col] = adata.obs[source_col].map(label_to_score)
1198
+
1199
+ # Create mapping result with table and metadata
1200
+ mapping_result = None
1201
+ if save_mapping:
1202
+ save_dir = Path(save_mapping)
1203
+ save_dir.mkdir(parents=True, exist_ok=True)
1204
+
1205
+ # Count cells per label
1206
+ cell_counts = adata.obs[source_col].value_counts().to_dict()
1207
+
1208
+ # Create structured mapping result
1209
+ mapping_result = create_mapping_table(
1210
+ mappings=mappings,
1211
+ cell_counts=cell_counts,
1212
+ skipped_labels=list(skipped),
1213
+ index_source=str(index_path) if index_path else None,
1214
+ min_score=min_score,
1215
+ dataset_name=dataset_name or "ontology_mapping",
1216
+ )
1217
+
1218
+ # Save artifacts
1219
+ name = dataset_name or "ontology_mapping"
1220
+ csv_path = save_dir / f"{name}_ontology_mapping.csv"
1221
+ json_path = save_dir / f"{name}_ontology_mapping_metadata.json"
1222
+
1223
+ mapping_result.to_csv(csv_path)
1224
+ mapping_result.to_json(json_path)
1225
+
1226
+ logger.info(f"Saved mapping table to: {csv_path}")
1227
+ logger.info(f"Saved mapping metadata to: {json_path}")
1228
+
1229
+ return adata, mappings, mapping_result
1230
+
1231
+
1232
+ def validate_cl_term(term_id: str, ontology_index: Optional[Dict] = None) -> bool:
1233
+ """
1234
+ Check if a CL term ID exists in the ontology.
1235
+
1236
+ Parameters
1237
+ ----------
1238
+ term_id : str
1239
+ Cell Ontology ID (e.g., "CL:0000624").
1240
+ ontology_index : Dict, optional
1241
+ Pre-loaded ontology index.
1242
+
1243
+ Returns
1244
+ -------
1245
+ bool
1246
+ True if term exists in CL ontology.
1247
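+ 
+ Examples
+ --------
+ >>> validate_cl_term("CL:0000236")  # "B cell" in the bundled index
+ True
+ >>> validate_cl_term("CL:9999999")  # not a real CL term
+ False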
+ """
1248
+ if ontology_index is None:
1249
+ ontology_index = load_ontology_index()
1250
+
1251
+ cl_index = ontology_index.get("cl", {})
1252
+ for term_data in cl_index.values():
1253
+ if term_data.get("id") == term_id:
1254
+ return True
1255
+ return False
1256
+
1257
+
1258
+ def get_cl_id(term_name: str, ontology_index: Optional[Dict] = None) -> Optional[str]:
1259
+ """
1260
+ Get CL ID for a term name (exact match).
1261
+
1262
+ Parameters
1263
+ ----------
1264
+ term_name : str
1265
+ Term name to look up.
1266
+ ontology_index : Dict, optional
1267
+ Pre-loaded ontology index.
1268
+
1269
+ Returns
1270
+ -------
1271
+ str or None
1272
+ CL ID if found, None otherwise.
1273
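+ 
+ Examples
+ --------
+ >>> get_cl_id("B cell")  # index entry shown in load_ontology_index()
+ 'CL:0000236'
+ >>> get_cl_id("not a cell type") is None
+ True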
+ """
1274
+ if ontology_index is None:
1275
+ ontology_index = load_ontology_index()
1276
+
1277
+ term_lower = term_name.lower().strip()
1278
+ cl_index = ontology_index.get("cl", {})
1279
+
1280
+ if term_lower in cl_index:
1281
+ return cl_index[term_lower]["id"]
1282
+ return None