spatialcore 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. spatialcore/__init__.py +122 -0
  2. spatialcore/annotation/__init__.py +253 -0
  3. spatialcore/annotation/acquisition.py +529 -0
  4. spatialcore/annotation/annotate.py +603 -0
  5. spatialcore/annotation/cellxgene.py +365 -0
  6. spatialcore/annotation/confidence.py +802 -0
  7. spatialcore/annotation/discovery.py +529 -0
  8. spatialcore/annotation/expression.py +363 -0
  9. spatialcore/annotation/loading.py +529 -0
  10. spatialcore/annotation/markers.py +297 -0
  11. spatialcore/annotation/ontology.py +1282 -0
  12. spatialcore/annotation/patterns.py +247 -0
  13. spatialcore/annotation/pipeline.py +620 -0
  14. spatialcore/annotation/synapse.py +380 -0
  15. spatialcore/annotation/training.py +1457 -0
  16. spatialcore/annotation/validation.py +422 -0
  17. spatialcore/core/__init__.py +34 -0
  18. spatialcore/core/cache.py +118 -0
  19. spatialcore/core/logging.py +135 -0
  20. spatialcore/core/metadata.py +149 -0
  21. spatialcore/core/utils.py +768 -0
  22. spatialcore/data/gene_mappings/ensembl_to_hugo_human.tsv +86372 -0
  23. spatialcore/data/markers/canonical_markers.json +83 -0
  24. spatialcore/data/ontology_mappings/ontology_index.json +63865 -0
  25. spatialcore/plotting/__init__.py +109 -0
  26. spatialcore/plotting/benchmark.py +477 -0
  27. spatialcore/plotting/celltype.py +329 -0
  28. spatialcore/plotting/confidence.py +413 -0
  29. spatialcore/plotting/spatial.py +505 -0
  30. spatialcore/plotting/utils.py +411 -0
  31. spatialcore/plotting/validation.py +1342 -0
  32. spatialcore-0.1.9.dist-info/METADATA +213 -0
  33. spatialcore-0.1.9.dist-info/RECORD +36 -0
  34. spatialcore-0.1.9.dist-info/WHEEL +5 -0
  35. spatialcore-0.1.9.dist-info/licenses/LICENSE +201 -0
  36. spatialcore-0.1.9.dist-info/top_level.txt +1 -0
@@ -0,0 +1,363 @@
1
+ """Boolean expression evaluation for ontology IDs.
2
+
3
+ This module provides functionality to evaluate boolean expressions
4
+ on ontology IDs (CL, NCIT, UBERON) for filtering cells in spatial data.
5
+
6
+ Examples
7
+ --------
8
+ >>> from spatialcore.annotation.expression import evaluate_ontology_expression
9
+ >>> import anndata as ad
10
+ >>> adata = ad.read_h5ad("annotated.h5ad")
11
+ >>> # Single ontology filter
12
+ >>> mask = evaluate_ontology_expression("CL:0000236", adata) # B cells
13
+ >>> # AND expression
14
+ >>> mask = evaluate_ontology_expression("CL:0000236 & NCIT:C4349", adata) # B cells AND tumor
15
+ >>> # OR expression
16
+ >>> mask = evaluate_ontology_expression("CL:0000236 | CL:0000624", adata) # B cells OR CD4+ T cells
17
+ >>> # NOT expression
18
+ >>> mask = evaluate_ontology_expression("~NCIT:C4349", adata) # NOT tumor
19
+ >>> # Complex expression with parentheses
20
+ >>> mask = evaluate_ontology_expression("(CL:0000236 | CL:0000624) & ~NCIT:C4349", adata)
21
+ """
22
+
23
+ import re
24
+ from typing import List, Optional, Set
25
+
26
+ import numpy as np
27
+ import anndata as ad
28
+
29
+ from spatialcore.core.logging import get_logger
30
+
31
+ logger = get_logger(__name__)
32
+
33
# Columns of ``adata.obs`` that are searched for ontology IDs when the caller
# does not pass an explicit ``ontology_columns`` list. Only the entries that
# actually exist in ``adata.obs`` are used (see ``_find_ontology_columns``).
DEFAULT_ONTOLOGY_COLUMNS = [
    "cell_type_ontology_id",
    "disease_ontology_id",
    "tissue_ontology_id",
    "cell_type_ontology_term_id",  # Alternative naming
    "celltype_ontology_id",  # Alternative naming
]

# Regex pattern for ontology IDs (e.g., CL:0000236, NCIT:C4349, UBERON:0002107):
# a run of letters, a colon, then a run of letters/digits. Compiled
# case-insensitively. Callers in this module use ``.match``, so the pattern is
# anchored at the start of the text only, not at the end.
ONTOLOGY_ID_PATTERN = re.compile(r"([A-Z]+):([A-Z0-9]+)", re.IGNORECASE)
44
+
45
+
46
def _tokenize_expression(expression: str) -> List[str]:
    """
    Tokenize a boolean expression into operators and ontology IDs.

    Whitespace between tokens is ignored. Ontology IDs are recognised with
    ``ONTOLOGY_ID_PATTERN`` and upper-cased, so downstream comparisons do not
    depend on the case used in the expression.

    Parameters
    ----------
    expression
        Boolean expression string.

    Returns
    -------
    List of tokens: ontology IDs, operators (&, |, ~), and parentheses.

    Raises
    ------
    ValueError
        If a letter does not start a valid ontology ID, or if a character is
        neither whitespace, an operator, a parenthesis, nor a letter.
        Positions in the message refer to the stripped expression.
    """
    # Normalize whitespace
    expression = expression.strip()

    tokens = []
    i = 0
    length = len(expression)
    while i < length:
        char = expression[i]

        # Skip whitespace
        if char.isspace():
            i += 1
            continue

        # Operators and parentheses
        if char in "&|~()":
            tokens.append(char)
            i += 1
            continue

        # Ontology ID (starts with letters, contains colon)
        if char.isalpha():
            # Match in place at offset ``i`` rather than slicing
            # ``expression[i:]``; slicing copies the remainder of the string
            # for every ID and makes tokenizing quadratic in its length.
            match = ONTOLOGY_ID_PATTERN.match(expression, i)
            if match is None:
                raise ValueError(
                    f"Invalid token at position {i}: '{expression[i:i+10]}...'. "
                    "Expected ontology ID (e.g., CL:0000236, NCIT:C4349)."
                )
            tokens.append(match.group(0).upper())
            i = match.end()
            continue

        raise ValueError(
            f"Unexpected character '{char}' at position {i} in expression: '{expression}'"
        )

    return tokens
97
+
98
+
99
def _find_ontology_columns(adata: ad.AnnData, ontology_columns: Optional[List[str]]) -> List[str]:
    """
    Find available ontology columns in AnnData.

    Parameters
    ----------
    adata
        AnnData object; only ``adata.obs.columns`` is inspected.
    ontology_columns
        Explicit list of columns to search, or None to use
        ``DEFAULT_ONTOLOGY_COLUMNS``. A single column name given as a plain
        string is treated as a one-element list.

    Returns
    -------
    List of the requested (or default) column names that exist in
    ``adata.obs``, in the order they were requested. Requested columns that
    are absent are dropped as long as at least one is present.

    Raises
    ------
    ValueError
        If none of the requested columns, or none of the default columns,
        exist in ``adata.obs``.
    """
    obs_columns = adata.obs.columns

    if ontology_columns is not None:
        # A bare string would otherwise be iterated character by character,
        # so a valid column name would be reported as "not found".
        if isinstance(ontology_columns, str):
            ontology_columns = [ontology_columns]

        # Validate provided columns exist
        available = [col for col in ontology_columns if col in obs_columns]
        if not available:
            raise ValueError(
                f"None of the specified ontology columns found in adata.obs. "
                f"Requested: {ontology_columns}. "
                f"Available: {list(obs_columns)}"
            )
        return available

    # Use defaults
    available = [col for col in DEFAULT_ONTOLOGY_COLUMNS if col in obs_columns]
    if not available:
        raise ValueError(
            f"No ontology columns found in adata.obs. "
            f"Expected one of: {DEFAULT_ONTOLOGY_COLUMNS}. "
            f"Run add_ontology_ids() first to add ontology annotations."
        )
    return available
134
+
135
+
136
def _evaluate_single_id(
    ontology_id: str,
    adata: ad.AnnData,
    ontology_columns: List[str],
) -> np.ndarray:
    """
    Build the per-cell mask for one ontology ID.

    A cell matches when the upper-cased string form of its value in at least
    one of the given ``adata.obs`` columns equals the upper-cased ID exactly.

    Parameters
    ----------
    ontology_id
        Ontology ID to match (e.g., "CL:0000236").
    adata
        AnnData object; ``adata.n_obs`` and ``adata.obs`` are read.
    ontology_columns
        Names of the ``adata.obs`` columns to compare against.

    Returns
    -------
    Boolean array of length ``adata.n_obs``; True marks matching cells.
    All False when ``ontology_columns`` is empty.
    """
    wanted = ontology_id.upper()
    hits = np.zeros(adata.n_obs, dtype=bool)

    for column in ontology_columns:
        # Compare on the upper-cased string form so matching ignores case.
        normalized = adata.obs[column].astype(str).str.upper()
        column_hits = (normalized == wanted).to_numpy()
        # Accumulate in place: a cell matches if any column matches.
        np.logical_or(hits, column_hits, out=hits)

    return hits
169
+
170
+
171
def _parse_and_evaluate(
    tokens: List[str],
    adata: ad.AnnData,
    ontology_columns: List[str],
) -> np.ndarray:
    """
    Evaluate a token list as a boolean expression via recursive descent.

    Binding strength, loosest to tightest: ``|``, then ``&``, then ``~``;
    parentheses override it.

    Grammar:
        expression := term (('|') term)*
        term := factor (('&') factor)*
        factor := '~' factor | '(' expression ')' | ONTOLOGY_ID

    Parameters
    ----------
    tokens
        List of tokens from _tokenize_expression.
    adata
        AnnData object, passed through to ``_evaluate_single_id``.
    ontology_columns
        Columns to search, passed through to ``_evaluate_single_id``.

    Returns
    -------
    Boolean mask for cells matching the expression.

    Raises
    ------
    ValueError
        If the tokens end where an operand is expected, a closing parenthesis
        is missing, a token is not a valid operand, or tokens remain after a
        complete expression has been parsed.
    """
    cursor = 0  # index of the next unread token, shared by the closures below
    total = len(tokens)

    def current() -> Optional[str]:
        """Return the next unread token without consuming it, or None at the end."""
        return tokens[cursor] if cursor < total else None

    def advance() -> str:
        """Consume and return the next token."""
        nonlocal cursor
        taken = tokens[cursor]
        cursor += 1
        return taken

    def or_level() -> np.ndarray:
        """Parse a chain of '|'-separated terms."""
        combined = and_level()
        while current() == "|":
            advance()
            combined = combined | and_level()
        return combined

    def and_level() -> np.ndarray:
        """Parse a chain of '&'-separated factors."""
        combined = unary_level()
        while current() == "&":
            advance()
            combined = combined & unary_level()
        return combined

    def unary_level() -> np.ndarray:
        """Parse a negation, a parenthesised sub-expression, or one ontology ID."""
        head = current()

        if head is None:
            raise ValueError("Unexpected end of expression")

        if head == "~":
            advance()
            return ~unary_level()

        if head == "(":
            advance()
            inner = or_level()
            if current() != ")":
                raise ValueError("Missing closing parenthesis in expression")
            advance()
            return inner

        # Anything left must be an ontology ID.
        if not ONTOLOGY_ID_PATTERN.match(head):
            raise ValueError(f"Unexpected token: '{head}'")
        advance()
        return _evaluate_single_id(head, adata, ontology_columns)

    mask = or_level()

    # A complete expression must consume every token.
    if cursor < total:
        raise ValueError(
            f"Unexpected token after expression: '{tokens[cursor]}'"
        )

    return mask
262
+
263
+
264
def evaluate_ontology_expression(
    expression: str,
    adata: ad.AnnData,
    ontology_columns: Optional[List[str]] = None,
) -> np.ndarray:
    """
    Evaluate boolean expression on ontology IDs.

    Supports boolean operators: & (AND), | (OR), ~ (NOT), and parentheses.

    Parameters
    ----------
    expression
        Boolean expression using ontology IDs.
        Examples:
        - "CL:0000236" - B cells
        - "CL:0000236 & NCIT:C4349" - B cells AND tumor
        - "CL:0000236 | CL:0000624" - B cells OR CD4+ T cells
        - "~NCIT:C4349" - NOT tumor
        - "(CL:0000236 | CL:0000624) & ~NCIT:C4349" - (B OR CD4+) AND NOT tumor
    adata
        AnnData object with ontology ID columns in .obs.
    ontology_columns
        Specific columns to search for ontology IDs. If None, searches
        default columns: cell_type_ontology_id, disease_ontology_id,
        tissue_ontology_id, etc.

    Returns
    -------
    np.ndarray
        Boolean mask (shape: n_obs) where True indicates cells matching
        the expression.

    Raises
    ------
    ValueError
        If the expression is empty or its syntax is invalid, or if no
        ontology columns are found. Columns are resolved before the
        expression is tokenized, so a missing-column error is reported first.

    Notes
    -----
    Ontology IDs are matched case-insensitively. The function searches
    across all specified ontology columns and returns True if any column
    matches for a cell.

    The expression, the columns used and the tokens are logged at debug
    level; a match summary is logged at info level. For an AnnData with zero
    observations the summary reports 0.0% instead of dividing by zero.

    Examples
    --------
    >>> import anndata as ad
    >>> from spatialcore.annotation.expression import evaluate_ontology_expression
    >>> adata = ad.read_h5ad("annotated.h5ad")
    >>> # Find B cells
    >>> b_cell_mask = evaluate_ontology_expression("CL:0000236", adata)
    >>> adata_bcells = adata[b_cell_mask]
    >>> # Find B cells in tumor regions
    >>> mask = evaluate_ontology_expression("CL:0000236 & NCIT:C4349", adata)
    """
    if not expression or not expression.strip():
        raise ValueError("Expression cannot be empty")

    logger.debug(f"Evaluating ontology expression: {expression}")

    # Find available ontology columns
    available_columns = _find_ontology_columns(adata, ontology_columns)
    logger.debug(f"Using ontology columns: {available_columns}")

    # Tokenize expression
    tokens = _tokenize_expression(expression)
    logger.debug(f"Tokenized expression: {tokens}")

    # Parse and evaluate
    mask = _parse_and_evaluate(tokens, adata, available_columns)

    n_obs = adata.n_obs
    n_matching = int(mask.sum())
    # Guard the percentage: with zero cells the division would emit a NumPy
    # RuntimeWarning and log "nan%".
    percent = 100 * n_matching / n_obs if n_obs else 0.0
    logger.info(
        f"Ontology expression '{expression}' matched {n_matching:,} / {n_obs:,} cells "
        f"({percent:.1f}%)"
    )

    return mask
342
+
343
+
344
def get_ontology_ids_in_expression(expression: str) -> Set[str]:
    """
    Collect the distinct ontology IDs referenced by an expression.

    The expression is tokenized with ``_tokenize_expression``; operators and
    parentheses are discarded and the remaining IDs are returned upper-cased.

    Parameters
    ----------
    expression
        Boolean expression string.

    Returns
    -------
    Set of ontology IDs found in the expression.

    Raises
    ------
    ValueError
        Propagated from ``_tokenize_expression`` when the expression contains
        an invalid token or character.

    Examples
    --------
    >>> sorted(get_ontology_ids_in_expression("CL:0000236 & NCIT:C4349"))
    ['CL:0000236', 'NCIT:C4349']
    """
    found = set()
    for piece in _tokenize_expression(expression):
        # Operators and parentheses never match the ID pattern.
        if ONTOLOGY_ID_PATTERN.match(piece):
            found.add(piece.upper())
    return found