spatialcore 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spatialcore/__init__.py +122 -0
- spatialcore/annotation/__init__.py +253 -0
- spatialcore/annotation/acquisition.py +529 -0
- spatialcore/annotation/annotate.py +603 -0
- spatialcore/annotation/cellxgene.py +365 -0
- spatialcore/annotation/confidence.py +802 -0
- spatialcore/annotation/discovery.py +529 -0
- spatialcore/annotation/expression.py +363 -0
- spatialcore/annotation/loading.py +529 -0
- spatialcore/annotation/markers.py +297 -0
- spatialcore/annotation/ontology.py +1282 -0
- spatialcore/annotation/patterns.py +247 -0
- spatialcore/annotation/pipeline.py +620 -0
- spatialcore/annotation/synapse.py +380 -0
- spatialcore/annotation/training.py +1457 -0
- spatialcore/annotation/validation.py +422 -0
- spatialcore/core/__init__.py +34 -0
- spatialcore/core/cache.py +118 -0
- spatialcore/core/logging.py +135 -0
- spatialcore/core/metadata.py +149 -0
- spatialcore/core/utils.py +768 -0
- spatialcore/data/gene_mappings/ensembl_to_hugo_human.tsv +86372 -0
- spatialcore/data/markers/canonical_markers.json +83 -0
- spatialcore/data/ontology_mappings/ontology_index.json +63865 -0
- spatialcore/plotting/__init__.py +109 -0
- spatialcore/plotting/benchmark.py +477 -0
- spatialcore/plotting/celltype.py +329 -0
- spatialcore/plotting/confidence.py +413 -0
- spatialcore/plotting/spatial.py +505 -0
- spatialcore/plotting/utils.py +411 -0
- spatialcore/plotting/validation.py +1342 -0
- spatialcore-0.1.9.dist-info/METADATA +213 -0
- spatialcore-0.1.9.dist-info/RECORD +36 -0
- spatialcore-0.1.9.dist-info/WHEEL +5 -0
- spatialcore-0.1.9.dist-info/licenses/LICENSE +201 -0
- spatialcore-0.1.9.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,363 @@
|
|
|
1
|
+
"""Boolean expression evaluation for ontology IDs.
|
|
2
|
+
|
|
3
|
+
This module provides functionality to evaluate boolean expressions
|
|
4
|
+
on ontology IDs (CL, NCIT, UBERON) for filtering cells in spatial data.
|
|
5
|
+
|
|
6
|
+
Examples
|
|
7
|
+
--------
|
|
8
|
+
>>> from spatialcore.annotation.expression import evaluate_ontology_expression
|
|
9
|
+
>>> import anndata as ad
|
|
10
|
+
>>> adata = ad.read_h5ad("annotated.h5ad")
|
|
11
|
+
>>> # Single ontology filter
|
|
12
|
+
>>> mask = evaluate_ontology_expression("CL:0000236", adata) # B cells
|
|
13
|
+
>>> # AND expression
|
|
14
|
+
>>> mask = evaluate_ontology_expression("CL:0000236 & NCIT:C4349", adata) # B cells AND tumor
|
|
15
|
+
>>> # OR expression
|
|
16
|
+
>>> mask = evaluate_ontology_expression("CL:0000236 | CL:0000624", adata) # B cells OR CD4+ T cells
|
|
17
|
+
>>> # NOT expression
|
|
18
|
+
>>> mask = evaluate_ontology_expression("~NCIT:C4349", adata) # NOT tumor
|
|
19
|
+
>>> # Complex expression with parentheses
|
|
20
|
+
>>> mask = evaluate_ontology_expression("(CL:0000236 | CL:0000624) & ~NCIT:C4349", adata)
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
import re
|
|
24
|
+
from typing import List, Optional, Set
|
|
25
|
+
|
|
26
|
+
import numpy as np
|
|
27
|
+
import anndata as ad
|
|
28
|
+
|
|
29
|
+
from spatialcore.core.logging import get_logger
|
|
30
|
+
|
|
31
|
+
logger = get_logger(__name__)
|
|
32
|
+
|
|
33
|
+
# Default columns to search for ontology IDs, in lookup order. Only the
# columns actually present in ``adata.obs`` are used.
DEFAULT_ONTOLOGY_COLUMNS = [
    "cell_type_ontology_id",
    "disease_ontology_id",
    "tissue_ontology_id",
    "cell_type_ontology_term_id",  # Alternative naming
    "celltype_ontology_id",  # Alternative naming
]

# Regex pattern for ontology IDs (e.g., CL:0000236, NCIT:C4349, UBERON:0002107):
# a letters-only prefix, a colon, then an alphanumeric local part.
# re.ASCII is required alongside re.IGNORECASE: on a str pattern, a
# case-insensitive [A-Z] otherwise also matches four non-ASCII letters
# (U+0130, U+0131, U+017F, U+212A), which would let malformed IDs through.
ONTOLOGY_ID_PATTERN = re.compile(r"([A-Z]+):([A-Z0-9]+)", re.IGNORECASE | re.ASCII)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _tokenize_expression(expression: str) -> List[str]:
    """
    Split a boolean ontology expression into its lexical tokens.

    Parameters
    ----------
    expression
        Boolean expression string. Leading and trailing whitespace is
        stripped before scanning, so positions reported in error messages
        refer to the stripped string.

    Returns
    -------
    List of tokens in source order: upper-cased ontology IDs, the
    operators ``&``, ``|``, ``~``, and parentheses.

    Raises
    ------
    ValueError
        If a letter does not begin a valid ontology ID, or a character
        cannot start any token at all.
    """
    text = expression.strip()
    length = len(text)
    found: List[str] = []
    cursor = 0

    while cursor < length:
        current = text[cursor]

        if current.isspace():
            # Whitespace only separates tokens; it never produces one.
            cursor += 1
        elif current in "&|~()":
            # Every operator and parenthesis is a single-character token.
            found.append(current)
            cursor += 1
        elif current.isalpha():
            # A letter can only begin an ontology ID; anchor the match here.
            id_match = ONTOLOGY_ID_PATTERN.match(text, cursor)
            if id_match is None:
                raise ValueError(
                    f"Invalid token at position {cursor}: '{text[cursor:cursor+10]}...'. "
                    "Expected ontology ID (e.g., CL:0000236, NCIT:C4349)."
                )
            # IDs are stored upper-cased so later comparisons are uniform.
            found.append(id_match.group(0).upper())
            cursor = id_match.end()
        else:
            raise ValueError(
                f"Unexpected character '{current}' at position {cursor} in expression: '{text}'"
            )

    return found
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _find_ontology_columns(adata: ad.AnnData, ontology_columns: Optional[List[str]]) -> List[str]:
    """
    Resolve which ontology columns of ``adata.obs`` can be searched.

    Parameters
    ----------
    adata
        AnnData object.
    ontology_columns
        Explicit list of candidate columns, or None to fall back to
        ``DEFAULT_ONTOLOGY_COLUMNS``.

    Returns
    -------
    The candidate columns that exist in ``adata.obs``, in candidate order.
    Candidates that are absent are dropped without error as long as at
    least one is present.

    Raises
    ------
    ValueError
        If none of the candidate columns exists in ``adata.obs``.
    """
    obs_columns = adata.obs.columns
    user_supplied = ontology_columns is not None
    candidates = ontology_columns if user_supplied else DEFAULT_ONTOLOGY_COLUMNS

    present = [name for name in candidates if name in obs_columns]
    if present:
        return present

    # Nothing usable: the message depends on where the candidates came from.
    if user_supplied:
        raise ValueError(
            f"None of the specified ontology columns found in adata.obs. "
            f"Requested: {ontology_columns}. "
            f"Available: {list(adata.obs.columns)}"
        )
    raise ValueError(
        f"No ontology columns found in adata.obs. "
        f"Expected one of: {DEFAULT_ONTOLOGY_COLUMNS}. "
        f"Run add_ontology_ids() first to add ontology annotations."
    )
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _evaluate_single_id(
    ontology_id: str,
    adata: ad.AnnData,
    ontology_columns: List[str],
) -> np.ndarray:
    """
    Build the per-cell mask for one ontology ID.

    Parameters
    ----------
    ontology_id
        Ontology ID to match (e.g., "CL:0000236"). Compared
        case-insensitively.
    adata
        AnnData object; ``adata.obs`` and ``adata.n_obs`` are read.
    ontology_columns
        Names of the ``adata.obs`` columns to search.

    Returns
    -------
    Boolean array of length ``adata.n_obs``. An entry is True when the
    cell's value equals the ID in at least one of the searched columns.
    """
    target = ontology_id.upper()
    hits = np.zeros(adata.n_obs, dtype=bool)

    for column in ontology_columns:
        # Stringify and upper-case the column so the comparison is
        # case-insensitive whatever dtype the column is stored as.
        normalized = adata.obs[column].astype(str).str.upper()
        # Whole-value equality only; a cell matches if any column matches.
        hits |= (normalized == target).to_numpy()

    return hits
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def _parse_and_evaluate(
    tokens: List[str],
    adata: ad.AnnData,
    ontology_columns: List[str],
) -> np.ndarray:
    """
    Evaluate a tokenized boolean expression with a recursive-descent parser.

    Grammar (loosest binding first):
        expression := term (('|') term)*
        term := factor (('&') factor)*
        factor := '~' factor | '(' expression ')' | ONTOLOGY_ID

    So ``~`` binds tighter than ``&``, which binds tighter than ``|``.

    Parameters
    ----------
    tokens
        List of tokens from _tokenize_expression.
    adata
        AnnData object.
    ontology_columns
        Columns to search.

    Returns
    -------
    Boolean mask for cells matching the expression.

    Raises
    ------
    ValueError
        If the token stream ends early, a parenthesis is left unclosed,
        an unexpected token appears, or tokens remain after a complete
        expression has been parsed.
    """
    cursor = 0
    n_tokens = len(tokens)

    def current() -> Optional[str]:
        # Look at the next token without consuming it; None at end of input.
        return tokens[cursor] if cursor < n_tokens else None

    def advance() -> None:
        # Step past the token that current() just reported.
        nonlocal cursor
        cursor += 1

    def parse_or() -> np.ndarray:
        """Parse a chain of '|'-separated terms."""
        combined = parse_and()
        while current() == "|":
            advance()
            combined = combined | parse_and()
        return combined

    def parse_and() -> np.ndarray:
        """Parse a chain of '&'-separated factors."""
        combined = parse_unary()
        while current() == "&":
            advance()
            combined = combined & parse_unary()
        return combined

    def parse_unary() -> np.ndarray:
        """Parse a negation, a parenthesized group, or a single ontology ID."""
        tok = current()

        if tok is None:
            raise ValueError("Unexpected end of expression")

        if tok == "~":
            advance()
            # Recursing here makes '~' right-associative, so '~~X' parses.
            return ~parse_unary()

        if tok == "(":
            advance()
            inner = parse_or()
            if current() != ")":
                raise ValueError("Missing closing parenthesis in expression")
            advance()
            return inner

        # Anything left must be an ontology ID.
        if not ONTOLOGY_ID_PATTERN.match(tok):
            raise ValueError(f"Unexpected token: '{tok}'")
        advance()
        return _evaluate_single_id(tok, adata, ontology_columns)

    mask = parse_or()

    # A well-formed expression consumes every token.
    if cursor < n_tokens:
        raise ValueError(
            f"Unexpected token after expression: '{tokens[cursor]}'"
        )

    return mask
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def evaluate_ontology_expression(
    expression: str,
    adata: ad.AnnData,
    ontology_columns: Optional[List[str]] = None,
) -> np.ndarray:
    """
    Evaluate boolean expression on ontology IDs.

    Supports boolean operators: & (AND), | (OR), ~ (NOT), and parentheses.

    Parameters
    ----------
    expression
        Boolean expression using ontology IDs.
        Examples:
        - "CL:0000236" - B cells
        - "CL:0000236 & NCIT:C4349" - B cells AND tumor
        - "CL:0000236 | CL:0000624" - B cells OR CD4+ T cells
        - "~NCIT:C4349" - NOT tumor
        - "(CL:0000236 | CL:0000624) & ~NCIT:C4349" - (B OR CD4+) AND NOT tumor
    adata
        AnnData object with ontology ID columns in .obs.
    ontology_columns
        Specific columns to search for ontology IDs. If None, searches
        default columns: cell_type_ontology_id, disease_ontology_id,
        tissue_ontology_id, etc.

    Returns
    -------
    np.ndarray
        Boolean mask (shape: n_obs) where True indicates cells matching
        the expression.

    Raises
    ------
    ValueError
        If expression is empty, its syntax is invalid, or no ontology
        columns are found.

    Notes
    -----
    Ontology IDs are matched case-insensitively. The function searches
    across all specified ontology columns and returns True if any column
    matches for a cell.

    Ontology columns are resolved before the expression is tokenized, so
    a missing-column error is raised ahead of any syntax error.

    Examples
    --------
    >>> import anndata as ad
    >>> from spatialcore.annotation.expression import evaluate_ontology_expression
    >>> adata = ad.read_h5ad("annotated.h5ad")
    >>> # Find B cells
    >>> b_cell_mask = evaluate_ontology_expression("CL:0000236", adata)
    >>> adata_bcells = adata[b_cell_mask]
    >>> # Find B cells in tumor regions
    >>> mask = evaluate_ontology_expression("CL:0000236 & NCIT:C4349", adata)
    """
    if not expression or not expression.strip():
        raise ValueError("Expression cannot be empty")

    logger.debug(f"Evaluating ontology expression: {expression}")

    # Find available ontology columns
    available_columns = _find_ontology_columns(adata, ontology_columns)
    logger.debug(f"Using ontology columns: {available_columns}")

    # Tokenize expression
    tokens = _tokenize_expression(expression)
    logger.debug(f"Tokenized expression: {tokens}")

    # Parse and evaluate
    mask = _parse_and_evaluate(tokens, adata, available_columns)

    n_total = adata.n_obs
    n_matching = int(mask.sum())
    # Guard the percentage: an AnnData with zero observations would
    # otherwise divide by zero in the summary log line.
    percent = 100 * n_matching / n_total if n_total else 0.0
    logger.info(
        f"Ontology expression '{expression}' matched {n_matching:,} / {n_total:,} cells "
        f"({percent:.1f}%)"
    )

    return mask
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
def get_ontology_ids_in_expression(expression: str) -> Set[str]:
    """
    Collect the distinct ontology IDs referenced by an expression.

    Parameters
    ----------
    expression
        Boolean expression string.

    Returns
    -------
    Set of upper-cased ontology IDs found in the expression; operators
    and parentheses are discarded.

    Raises
    ------
    ValueError
        Propagated from tokenization when the expression contains an
        invalid token.

    Examples
    --------
    >>> get_ontology_ids_in_expression("CL:0000236 & NCIT:C4349")
    {'CL:0000236', 'NCIT:C4349'}
    """
    ids: Set[str] = set()
    for tok in _tokenize_expression(expression):
        # Operators and parentheses never match the ID pattern.
        if ONTOLOGY_ID_PATTERN.match(tok):
            ids.add(tok.upper())
    return ids
|