spatialcore-0.1.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. spatialcore/__init__.py +122 -0
  2. spatialcore/annotation/__init__.py +253 -0
  3. spatialcore/annotation/acquisition.py +529 -0
  4. spatialcore/annotation/annotate.py +603 -0
  5. spatialcore/annotation/cellxgene.py +365 -0
  6. spatialcore/annotation/confidence.py +802 -0
  7. spatialcore/annotation/discovery.py +529 -0
  8. spatialcore/annotation/expression.py +363 -0
  9. spatialcore/annotation/loading.py +529 -0
  10. spatialcore/annotation/markers.py +297 -0
  11. spatialcore/annotation/ontology.py +1282 -0
  12. spatialcore/annotation/patterns.py +247 -0
  13. spatialcore/annotation/pipeline.py +620 -0
  14. spatialcore/annotation/synapse.py +380 -0
  15. spatialcore/annotation/training.py +1457 -0
  16. spatialcore/annotation/validation.py +422 -0
  17. spatialcore/core/__init__.py +34 -0
  18. spatialcore/core/cache.py +118 -0
  19. spatialcore/core/logging.py +135 -0
  20. spatialcore/core/metadata.py +149 -0
  21. spatialcore/core/utils.py +768 -0
  22. spatialcore/data/gene_mappings/ensembl_to_hugo_human.tsv +86372 -0
  23. spatialcore/data/markers/canonical_markers.json +83 -0
  24. spatialcore/data/ontology_mappings/ontology_index.json +63865 -0
  25. spatialcore/plotting/__init__.py +109 -0
  26. spatialcore/plotting/benchmark.py +477 -0
  27. spatialcore/plotting/celltype.py +329 -0
  28. spatialcore/plotting/confidence.py +413 -0
  29. spatialcore/plotting/spatial.py +505 -0
  30. spatialcore/plotting/utils.py +411 -0
  31. spatialcore/plotting/validation.py +1342 -0
  32. spatialcore-0.1.9.dist-info/METADATA +213 -0
  33. spatialcore-0.1.9.dist-info/RECORD +36 -0
  34. spatialcore-0.1.9.dist-info/WHEEL +5 -0
  35. spatialcore-0.1.9.dist-info/licenses/LICENSE +201 -0
  36. spatialcore-0.1.9.dist-info/top_level.txt +1 -0
spatialcore/annotation/acquisition.py
@@ -0,0 +1,529 @@
"""
Data acquisition utilities for reference datasets.

This module provides a unified interface for downloading reference data
from various sources (CellxGene, Synapse) and storing it to local or cloud
destinations (GCS, S3).

This is Phase 1 of the training workflow - run once upstream to acquire
and store reference data before training.

Workflow
--------
1. Use acquire_reference() to download from source → store to destination
2. Use train_and_annotate() with the stored paths for training

Example
-------
>>> from spatialcore.annotation import acquire_reference
>>> # Download from CellxGene → store to GCS
>>> path = acquire_reference(
...     source="cellxgene://human_lung_cell_atlas",
...     output="gs://my-bucket/references/hlca.h5ad",
... )
>>> # Later, use in training pipeline
>>> adata = train_and_annotate(
...     spatial_adata,
...     references=["gs://my-bucket/references/hlca.h5ad"],
...     tissue="lung",
... )
"""

import os
import tempfile
from pathlib import Path
from typing import Union

import anndata as ad

from spatialcore.core.logging import get_logger

logger = get_logger(__name__)


# ============================================================================
# Cloud I/O Utilities
# ============================================================================

def _upload_to_gcs(local_path: Path, gcs_uri: str) -> str:
    """
    Upload a local file to Google Cloud Storage.

    Parameters
    ----------
    local_path : Path
        Local file to upload.
    gcs_uri : str
        Destination URI (gs://bucket/path/file.h5ad).

    Returns
    -------
    str
        The GCS URI.
    """
    try:
        from google.cloud import storage
    except ImportError:
        raise ImportError(
            "google-cloud-storage is required for GCS uploads. "
            "Install with: pip install google-cloud-storage"
        )

    # Parse GCS URI: gs://bucket/path/to/file.h5ad
    if not gcs_uri.startswith("gs://"):
        raise ValueError(f"Invalid GCS URI: {gcs_uri}. Must start with gs://")

    uri_path = gcs_uri[5:]  # Remove "gs://"
    parts = uri_path.split("/", 1)
    bucket_name = parts[0]
    blob_path = parts[1] if len(parts) > 1 else Path(local_path).name

    logger.info(f"Uploading to GCS: {gcs_uri}")
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(blob_path)
    blob.upload_from_filename(str(local_path))
    logger.info(f"Upload complete: {gcs_uri}")

    return gcs_uri


def _upload_to_s3(local_path: Path, s3_uri: str) -> str:
    """
    Upload a local file to Amazon S3.

    Parameters
    ----------
    local_path : Path
        Local file to upload.
    s3_uri : str
        Destination URI (s3://bucket/path/file.h5ad).

    Returns
    -------
    str
        The S3 URI.
    """
    try:
        import boto3
    except ImportError:
        raise ImportError(
            "boto3 is required for S3 uploads. "
            "Install with: pip install boto3"
        )

    # Parse S3 URI: s3://bucket/path/to/file.h5ad
    if not s3_uri.startswith("s3://"):
        raise ValueError(f"Invalid S3 URI: {s3_uri}. Must start with s3://")

    uri_path = s3_uri[5:]  # Remove "s3://"
    parts = uri_path.split("/", 1)
    bucket_name = parts[0]
    object_key = parts[1] if len(parts) > 1 else Path(local_path).name

    logger.info(f"Uploading to S3: {s3_uri}")
    s3_client = boto3.client("s3")
    s3_client.upload_file(str(local_path), bucket_name, object_key)
    logger.info(f"Upload complete: {s3_uri}")

    return s3_uri

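# A small illustration of the URI parsing shared by the two upload helpers
# above (a sketch; the values follow from str.split semantics):
#
# >>> "gs://my-bucket/refs/lung.h5ad"[5:].split("/", 1)
# ['my-bucket', 'refs/lung.h5ad']
# >>> "s3://my-bucket"[5:].split("/", 1)  # no object key → fall back to the local filename
# ['my-bucket']
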
def _write_output(adata: ad.AnnData, output: Union[str, Path]) -> str:
    """
    Write AnnData to local path or cloud storage.

    Parameters
    ----------
    adata : AnnData
        Data to write.
    output : str or Path
        Output location. Supports:

        - Local path: /data/refs/lung.h5ad
        - GCS: gs://bucket/refs/lung.h5ad
        - S3: s3://bucket/refs/lung.h5ad

    Returns
    -------
    str
        The output path/URI.
    """
    output_str = str(output)

    if output_str.startswith("gs://"):
        # Write to temp file, upload to GCS
        with tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False) as tmp:
            tmp_path = Path(tmp.name)
        try:
            logger.info(f"Writing to temporary file: {tmp_path}")
            adata.write_h5ad(tmp_path)
            return _upload_to_gcs(tmp_path, output_str)
        finally:
            if tmp_path.exists():
                tmp_path.unlink()

    elif output_str.startswith("s3://"):
        # Write to temp file, upload to S3
        with tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False) as tmp:
            tmp_path = Path(tmp.name)
        try:
            logger.info(f"Writing to temporary file: {tmp_path}")
            adata.write_h5ad(tmp_path)
            return _upload_to_s3(tmp_path, output_str)
        finally:
            if tmp_path.exists():
                tmp_path.unlink()

    else:
        # Local path
        local_path = Path(output_str)
        local_path.parent.mkdir(parents=True, exist_ok=True)
        logger.info(f"Writing to local file: {local_path}")
        adata.write_h5ad(local_path)
        return str(local_path)

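# A minimal usage sketch for the writer above, assuming default GCP
# credentials are configured and "my-bucket" is a hypothetical bucket name:
#
# >>> import anndata as ad
# >>> import numpy as np
# >>> demo = ad.AnnData(np.zeros((3, 2), dtype=np.float32))
# >>> _write_output(demo, "/tmp/demo.h5ad")            # plain local write
# '/tmp/demo.h5ad'
# >>> _write_output(demo, "gs://my-bucket/demo.h5ad")  # temp file, then upload
# 'gs://my-bucket/demo.h5ad'
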
def _download_from_gcs(gcs_uri: str, local_path: Path) -> Path:
    """Download file from GCS to local path."""
    try:
        from google.cloud import storage
    except ImportError:
        raise ImportError(
            "google-cloud-storage is required for GCS. "
            "Install with: pip install google-cloud-storage"
        )

    if not gcs_uri.startswith("gs://"):
        raise ValueError(f"Invalid GCS URI: {gcs_uri}")

    uri_path = gcs_uri[5:]
    parts = uri_path.split("/", 1)
    bucket_name = parts[0]
    blob_path = parts[1] if len(parts) > 1 else ""

    local_path.parent.mkdir(parents=True, exist_ok=True)

    logger.info(f"Downloading from GCS: {gcs_uri}")
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(blob_path)
    blob.download_to_filename(str(local_path))
    logger.info(f"Downloaded to: {local_path}")

    return local_path


def _download_from_s3(s3_uri: str, local_path: Path) -> Path:
    """Download file from S3 to local path."""
    try:
        import boto3
        from botocore.exceptions import ClientError
    except ImportError:
        raise ImportError(
            "boto3 is required for S3. "
            "Install with: pip install boto3"
        )

    if not s3_uri.startswith("s3://"):
        raise ValueError(f"Invalid S3 URI: {s3_uri}")

    uri_path = s3_uri[5:]
    parts = uri_path.split("/", 1)
    bucket_name = parts[0]
    object_key = parts[1] if len(parts) > 1 else ""

    local_path.parent.mkdir(parents=True, exist_ok=True)

    logger.info(f"Downloading from S3: {s3_uri}")
    try:
        s3_client = boto3.client("s3")
        s3_client.download_file(bucket_name, object_key, str(local_path))
        logger.info(f"Downloaded to: {local_path}")
    except ClientError as e:
        error_code = e.response.get("Error", {}).get("Code", "Unknown")
        if error_code == "404":
            raise FileNotFoundError(f"S3 object not found: {s3_uri}")
        elif error_code == "403":
            raise PermissionError(f"Access denied to S3 object: {s3_uri}")
        raise

    return local_path


def resolve_uri_to_local(
    uri: str,
    cache_dir: Path,
    force: bool = False,
) -> Path:
    """
    Resolve a URI to a local file path, downloading if necessary.

    Parameters
    ----------
    uri : str
        Source URI. Supports:

        - Local path: /data/refs/lung.h5ad
        - GCS: gs://bucket/refs/lung.h5ad
        - S3: s3://bucket/refs/lung.h5ad
    cache_dir : Path
        Directory for downloaded files.
    force : bool, default False
        Re-download even if cached.

    Returns
    -------
    Path
        Local file path.
    """
    if uri.startswith("gs://"):
        # Extract filename from URI
        filename = Path(uri[5:].split("/", 1)[1]).name if "/" in uri[5:] else "data.h5ad"
        local_path = cache_dir / filename
        if not local_path.exists() or force:
            return _download_from_gcs(uri, local_path)
        else:
            logger.info(f"Using cached file: {local_path}")
            return local_path

    elif uri.startswith("s3://"):
        filename = Path(uri[5:].split("/", 1)[1]).name if "/" in uri[5:] else "data.h5ad"
        local_path = cache_dir / filename
        if not local_path.exists() or force:
            return _download_from_s3(uri, local_path)
        else:
            logger.info(f"Using cached file: {local_path}")
            return local_path

    else:
        # Local path
        local_path = Path(uri)
        if not local_path.exists():
            raise FileNotFoundError(f"File not found: {local_path}")
        return local_path

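# A minimal sketch of the resolver's caching behavior, assuming GCS
# credentials are available and "my-bucket" is a hypothetical bucket:
#
# >>> from pathlib import Path
# >>> cache = Path.home() / ".cache" / "spatialcore"
# >>> p = resolve_uri_to_local("gs://my-bucket/refs/lung.h5ad", cache)  # first call downloads
# >>> p == cache / "lung.h5ad"
# True
# >>> _ = resolve_uri_to_local("gs://my-bucket/refs/lung.h5ad", cache)  # second call reuses the cache
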
# ============================================================================
# Main Acquisition Function
# ============================================================================

def acquire_reference(
    source: str,
    output: Union[str, Path],
    force: bool = False,
    **kwargs,
) -> str:
    """
    Download reference data from a source and store it to a destination.

    This is the unified data acquisition function for Phase 1 of the
    training workflow. It handles downloading from public databases
    and storing to the local filesystem or cloud storage.

    Parameters
    ----------
    source : str
        Source to download from. Supported schemes:

        - ``cellxgene://dataset_key`` - CellxGene Census dataset
          (e.g., "cellxgene://human_lung_cell_atlas")
        - ``cellxgene://?tissue=lung&disease=normal`` - CellxGene query
        - ``synapse://syn12345678`` - Synapse entity ID

    output : str or Path
        Destination to store the data. Supports:

        - Local path: ``/data/refs/lung.h5ad``
        - GCS: ``gs://bucket/refs/lung.h5ad``
        - S3: ``s3://bucket/refs/lung.h5ad``

    force : bool, default False
        Re-download and overwrite even if the output exists.

    **kwargs
        Source-specific options:

        - ``max_cells`` (int): Maximum cells to download (for CellxGene queries)
        - ``auth_token`` (str): Synapse authentication token
        - ``tissue``, ``disease``, ``cell_type`` (str): CellxGene query filters

    Returns
    -------
    str
        The output path/URI (the same as the ``output`` parameter).
        Use this path in the train_and_annotate() references list.

    Raises
    ------
    ValueError
        If the source scheme is not recognized.
    ImportError
        If the required cloud SDK is not installed.

    Examples
    --------
    >>> from spatialcore.annotation import acquire_reference
    >>> # Download from CellxGene → store locally
    >>> path = acquire_reference(
    ...     source="cellxgene://human_lung_cell_atlas",
    ...     output="/data/references/hlca.h5ad",
    ... )
    >>> print(path)
    /data/references/hlca.h5ad

    >>> # Download from CellxGene → store to GCS
    >>> gcs_path = acquire_reference(
    ...     source="cellxgene://human_lung_cell_atlas",
    ...     output="gs://my-bucket/references/hlca.h5ad",
    ... )

    >>> # CellxGene query with filters → store to S3
    >>> s3_path = acquire_reference(
    ...     source="cellxgene://?tissue=liver&disease=normal",
    ...     output="s3://my-bucket/references/healthy_liver.h5ad",
    ...     max_cells=100000,
    ... )

    >>> # Download from Synapse → store locally
    >>> path = acquire_reference(
    ...     source="synapse://syn12345678",
    ...     output="/data/references/lung_ref.h5ad",
    ...     auth_token=os.environ.get("SYNAPSE_AUTH_TOKEN"),
    ... )

    See Also
    --------
    train_and_annotate : Use acquired references for training.
    download_cellxgene_reference : Direct CellxGene download (low-level).
    query_cellxgene_census : CellxGene query with filters (low-level).
    download_synapse_reference : Direct Synapse download (low-level).

    Notes
    -----
    **Recommended Workflow:**

    1. Run ``acquire_reference()`` once to download and store data
    2. Use the returned path in the ``train_and_annotate()`` references list
    3. Cloud storage (GCS/S3) enables reproducible training across team members

    **Cloud Storage Benefits:**

    - Versioned datasets for reproducibility
    - Team collaboration with shared references
    - No local disk requirements for large datasets
    """
    output_str = str(output)

    # Check if output exists (for cloud, we'd need to check remotely)
    if not force and not output_str.startswith(("gs://", "s3://")):
        local_output = Path(output_str)
        if local_output.exists():
            logger.info(f"Output already exists: {output_str}")
            return output_str

    # Parse source scheme
    if source.startswith("cellxgene://"):
        adata = _acquire_from_cellxgene(source, **kwargs)

    elif source.startswith("synapse://"):
        adata = _acquire_from_synapse(source, **kwargs)

    else:
        raise ValueError(
            f"Unknown source scheme: {source}. "
            "Supported: cellxgene://, synapse://"
        )

    # Write to output (local or cloud)
    result_path = _write_output(adata, output)

    logger.info(f"Acquisition complete: {source} → {result_path}")
    return result_path


def _acquire_from_cellxgene(source: str, **kwargs) -> ad.AnnData:
    """
    Acquire data from CellxGene Census.

    Supports two formats:
    - cellxgene://dataset_key - Download predefined dataset
    - cellxgene://?tissue=lung&disease=normal - Query with filters
    """
    from urllib.parse import parse_qs, urlparse

    # Remove scheme
    rest = source[12:]  # len("cellxgene://") = 12

    if rest.startswith("?"):
        # Query format: cellxgene://?tissue=lung&disease=normal
        from spatialcore.annotation.cellxgene import query_cellxgene_census

        parsed = urlparse(source)
        params = parse_qs(parsed.query)

        # Extract query parameters
        tissue = params.get("tissue", [None])[0]
        disease = params.get("disease", [None])[0]
        cell_type = params.get("cell_type", [None])[0]
        assay = params.get("assay", [None])[0]

        logger.info("Querying CellxGene Census with filters...")
        logger.info(f"  tissue={tissue}, disease={disease}, cell_type={cell_type}")

        adata = query_cellxgene_census(
            tissue=tissue,
            disease=disease,
            cell_type=cell_type,
            assay=assay,
            max_cells=kwargs.get("max_cells"),
            random_state=kwargs.get("random_state", 42),
        )

    else:
        # Dataset key format: cellxgene://human_lung_cell_atlas
        from spatialcore.annotation.cellxgene import (
            download_cellxgene_reference,
            CELLXGENE_DATASETS,
        )

        dataset_key = rest

        if dataset_key not in CELLXGENE_DATASETS:
            available = ", ".join(CELLXGENE_DATASETS.keys())
            raise ValueError(
                f"Unknown CellxGene dataset: '{dataset_key}'. "
                f"Available: {available}. "
                "For custom queries, use: cellxgene://?tissue=lung&disease=normal"
            )

        # Download to temp location, then load
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_path = download_cellxgene_reference(
                dataset_key=dataset_key,
                output_dir=tmp_dir,
                force=True,
            )
            adata = ad.read_h5ad(tmp_path)

    logger.info(f"Acquired from CellxGene: {adata.n_obs:,} cells × {adata.n_vars:,} genes")
    return adata

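# How the query form of the URI parses, shown with the standard library
# (a sketch; the values follow from urllib's documented behavior):
#
# >>> from urllib.parse import parse_qs, urlparse
# >>> parse_qs(urlparse("cellxgene://?tissue=lung&disease=normal").query)
# {'tissue': ['lung'], 'disease': ['normal']}
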
def _acquire_from_synapse(source: str, **kwargs) -> ad.AnnData:
    """Acquire data from Synapse."""
    from spatialcore.annotation.synapse import download_synapse_reference

    # Remove scheme: synapse://syn12345678 -> syn12345678
    synapse_id = source[10:]  # len("synapse://") = 10

    # Download to temp location
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = download_synapse_reference(
            synapse_id=synapse_id,
            output_dir=Path(tmp_dir),
            auth_token=kwargs.get("auth_token"),
            force=True,
        )
        adata = ad.read_h5ad(tmp_path)

    logger.info(f"Acquired from Synapse: {adata.n_obs:,} cells × {adata.n_vars:,} genes")
    return adata