spatialcore-0.1.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. spatialcore/__init__.py +122 -0
  2. spatialcore/annotation/__init__.py +253 -0
  3. spatialcore/annotation/acquisition.py +529 -0
  4. spatialcore/annotation/annotate.py +603 -0
  5. spatialcore/annotation/cellxgene.py +365 -0
  6. spatialcore/annotation/confidence.py +802 -0
  7. spatialcore/annotation/discovery.py +529 -0
  8. spatialcore/annotation/expression.py +363 -0
  9. spatialcore/annotation/loading.py +529 -0
  10. spatialcore/annotation/markers.py +297 -0
  11. spatialcore/annotation/ontology.py +1282 -0
  12. spatialcore/annotation/patterns.py +247 -0
  13. spatialcore/annotation/pipeline.py +620 -0
  14. spatialcore/annotation/synapse.py +380 -0
  15. spatialcore/annotation/training.py +1457 -0
  16. spatialcore/annotation/validation.py +422 -0
  17. spatialcore/core/__init__.py +34 -0
  18. spatialcore/core/cache.py +118 -0
  19. spatialcore/core/logging.py +135 -0
  20. spatialcore/core/metadata.py +149 -0
  21. spatialcore/core/utils.py +768 -0
  22. spatialcore/data/gene_mappings/ensembl_to_hugo_human.tsv +86372 -0
  23. spatialcore/data/markers/canonical_markers.json +83 -0
  24. spatialcore/data/ontology_mappings/ontology_index.json +63865 -0
  25. spatialcore/plotting/__init__.py +109 -0
  26. spatialcore/plotting/benchmark.py +477 -0
  27. spatialcore/plotting/celltype.py +329 -0
  28. spatialcore/plotting/confidence.py +413 -0
  29. spatialcore/plotting/spatial.py +505 -0
  30. spatialcore/plotting/utils.py +411 -0
  31. spatialcore/plotting/validation.py +1342 -0
  32. spatialcore-0.1.9.dist-info/METADATA +213 -0
  33. spatialcore-0.1.9.dist-info/RECORD +36 -0
  34. spatialcore-0.1.9.dist-info/WHEEL +5 -0
  35. spatialcore-0.1.9.dist-info/licenses/LICENSE +201 -0
  36. spatialcore-0.1.9.dist-info/top_level.txt +1 -0
spatialcore/annotation/discovery.py
@@ -0,0 +1,529 @@
+ """
+ Training data discovery for local and cloud storage.
+
+ This module provides unified data discovery across:
+ 1. Local filesystem paths
+ 2. Google Cloud Storage (gs://) paths
+
+ The discovery function auto-detects path type and returns
+ consistent metadata regardless of storage backend.
+
+ Examples
+ --------
+ >>> from spatialcore.annotation import discover_training_data
+ >>> # Local discovery
+ >>> datasets = discover_training_data("/data/references/liver/")
+ >>> for ds in datasets:
+ ...     print(f"{ds.name}: {ds.size_human}")
+
+ >>> # GCS discovery
+ >>> datasets = discover_training_data("gs://my-bucket/cellxgene/")
+ >>> len(datasets)
+ 5
+ """
+
+ from pathlib import Path
+ from typing import List, Optional, Union, Literal
+ from dataclasses import dataclass
+ from datetime import datetime
+ import re
+ import time
+
+ from spatialcore.core.logging import get_logger
+
+ logger = get_logger(__name__)
+
+
+ @dataclass
+ class DiscoveredDataset:
+     """Metadata for a discovered training dataset.
+
+     Attributes
+     ----------
+     path : str
+         Full path to the dataset (local path or gs:// URL).
+     name : str
+         Dataset name (filename stem without extension).
+     size_bytes : int, optional
+         File size in bytes (None if unavailable).
+     size_human : str
+         Human-readable size (e.g., "2.3 GB").
+     storage_type : str
+         Storage backend: "local" or "gcs".
+     last_modified : str, optional
+         ISO timestamp of last modification (None if unavailable).
+     """
+
+     path: str
+     name: str
+     size_bytes: Optional[int]
+     size_human: str
+     storage_type: Literal["local", "gcs"]
+     last_modified: Optional[str]
+
+     def __repr__(self) -> str:
+         return (
+             f"DiscoveredDataset(name='{self.name}', "
+             f"size='{self.size_human}', type='{self.storage_type}')"
+         )
+
+
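For orientation, a minimal sketch of building one of these records by hand; the values are illustrative, not taken from any real dataset:

from spatialcore.annotation.discovery import DiscoveredDataset

# Illustrative values; in practice instances are created by the discovery helpers below.
ds = DiscoveredDataset(
    path="/data/references/liver/healthy_liver.h5ad",
    name="healthy_liver",
    size_bytes=2_470_000_000,
    size_human="2.3 GB",
    storage_type="local",
    last_modified="2024-01-15T09:30:00",
)
print(ds)  # DiscoveredDataset(name='healthy_liver', size='2.3 GB', type='local')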
+ def discover_training_data(
+     path: Union[str, Path],
+     pattern: str = "*.h5ad",
+     recursive: bool = False,
+ ) -> List[DiscoveredDataset]:
+     """
+     Discover available training data files at a path.
+
+     Supports both local filesystem paths and Google Cloud Storage (gs://) paths.
+     Auto-detects path type based on prefix.
+
+     Parameters
+     ----------
+     path : str or Path
+         Directory to search. Supports:
+         - Local paths: "/data/references/", "C:/Data/references/"
+         - GCS paths: "gs://bucket-name/references/"
+     pattern : str, default "*.h5ad"
+         Glob pattern for matching files (e.g., "*.h5ad", "*.parquet").
+     recursive : bool, default False
+         If True, search subdirectories recursively.
+
+     Returns
+     -------
+     List[DiscoveredDataset]
+         List of discovered datasets with metadata, sorted by name.
+
+     Raises
+     ------
+     FileNotFoundError
+         If local path does not exist.
+     ValueError
+         If GCS path is malformed.
+     ImportError
+         If google-cloud-storage is required but not installed.
+     PermissionError
+         If access to GCS path is denied.
+
+     Examples
+     --------
+     >>> from spatialcore.annotation import discover_training_data
+     >>> # Local discovery
+     >>> datasets = discover_training_data("./references/")
+     >>> for ds in datasets:
+     ...     print(f"{ds.name}: {ds.size_human}")
+     healthy_liver: 2.3 GB
+     hcc_liver: 1.8 GB
+
+     >>> # GCS discovery (requires google-cloud-storage)
+     >>> datasets = discover_training_data("gs://my-bucket/cellxgene/")
+     >>> len(datasets)
+     5
+
+     >>> # Recursive search
+     >>> datasets = discover_training_data("./references/", recursive=True)
+
+     See Also
+     --------
+     download_cellxgene_reference : Download pre-configured CellxGene datasets.
+     list_available_datasets : List pre-configured CellxGene dataset IDs.
+     """
+     path_str = str(path)
+
+     if path_str.startswith("gs://"):
+         return _discover_gcs_with_retry(path_str, pattern, recursive)
+     else:
+         return _discover_local(Path(path_str), pattern, recursive)
+
+
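A caller-side sketch of the dispatch above, showing how the documented exceptions might be handled; the bucket and directory names are placeholders:

from spatialcore.annotation import discover_training_data

try:
    datasets = discover_training_data("gs://my-bucket/cellxgene/", pattern="*.h5ad")
except ImportError:
    # google-cloud-storage is not installed; fall back to a local mirror (placeholder path).
    datasets = discover_training_data("/data/references/", pattern="*.h5ad")
except (FileNotFoundError, PermissionError, ConnectionError) as exc:
    # Missing path, denied access, or GCS repeatedly unavailable.
    raise SystemExit(f"Discovery failed: {exc}")

for ds in datasets:
    print(ds.name, ds.size_human, ds.storage_type)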
+ def _discover_local(
+     path: Path,
+     pattern: str,
+     recursive: bool,
+ ) -> List[DiscoveredDataset]:
+     """Discover datasets on local filesystem."""
+     if not path.exists():
+         raise FileNotFoundError(f"Path does not exist: {path}")
+
+     if not path.is_dir():
+         raise ValueError(f"Path is not a directory: {path}")
+
+     glob_method = path.rglob if recursive else path.glob
+     files = sorted(glob_method(pattern))
+
+     datasets = []
+     for f in files:
+         try:
+             stat = f.stat()
+             size_bytes = stat.st_size
+             last_modified = _format_timestamp(stat.st_mtime)
+         except OSError:
+             size_bytes = None
+             last_modified = None
+
+         datasets.append(
+             DiscoveredDataset(
+                 path=str(f.resolve()),
+                 name=f.stem,
+                 size_bytes=size_bytes,
+                 # _format_size returns "unknown" for None and "0.0 B" for empty files
+                 size_human=_format_size(size_bytes),
+                 storage_type="local",
+                 last_modified=last_modified,
+             )
+         )
+
+     logger.info(f"Discovered {len(datasets)} datasets at {path}")
+     return datasets
+
+
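A small pytest-style check of the local branch, assuming pytest's tmp_path fixture; it exercises only behavior visible above (default *.h5ad pattern, sorting by filename):

from spatialcore.annotation.discovery import discover_training_data

def test_local_discovery(tmp_path):
    # Two files that match the default pattern and one that should be ignored.
    (tmp_path / "healthy_liver.h5ad").write_bytes(b"\x00" * 1024)
    (tmp_path / "hcc_liver.h5ad").write_bytes(b"\x00" * 2048)
    (tmp_path / "notes.txt").write_text("ignore me")

    datasets = discover_training_data(tmp_path)

    # Sorted by path, so "hcc_liver" precedes "healthy_liver".
    assert [d.name for d in datasets] == ["hcc_liver", "healthy_liver"]
    assert all(d.storage_type == "local" for d in datasets)
    assert datasets[0].size_human == "2.0 KB"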
+ def _discover_gcs(
+     path: str,
+     pattern: str,
+     recursive: bool,
+ ) -> List[DiscoveredDataset]:
+     """Discover datasets on Google Cloud Storage."""
+     try:
+         from google.cloud import storage
+     except ImportError:
+         raise ImportError(
+             "google-cloud-storage is required for GCS paths. "
+             "Install with: pip install google-cloud-storage"
+         )
+
+     # Parse gs://bucket/prefix
+     match = re.match(r"gs://([^/]+)(?:/(.*))?", path)
+     if not match:
+         raise ValueError(f"Invalid GCS path format: {path}. Expected gs://bucket/prefix")
+
+     bucket_name = match.group(1)
+     prefix = match.group(2) or ""
+     if prefix and not prefix.endswith("/"):
+         prefix += "/"
+
+     # Convert glob pattern to regex for matching
+     pattern_regex = _glob_to_regex(pattern)
+
+     client = storage.Client()
+     bucket = client.bucket(bucket_name)
+
+     # List blobs - use delimiter only if not recursive
+     delimiter = None if recursive else "/"
+     blobs = bucket.list_blobs(prefix=prefix, delimiter=delimiter)
+
+     datasets = []
+     for blob in blobs:
+         # Skip "directory" markers
+         if blob.name.endswith("/"):
+             continue
+
+         # Check if the filename matches the pattern
+         filename = blob.name.split("/")[-1]
+         if not re.match(pattern_regex, filename):
+             continue
+
+         datasets.append(
+             DiscoveredDataset(
+                 path=f"gs://{bucket_name}/{blob.name}",
+                 name=Path(filename).stem,
+                 size_bytes=blob.size,
+                 # _format_size returns "unknown" when blob.size is None
+                 size_human=_format_size(blob.size),
+                 storage_type="gcs",
+                 last_modified=blob.updated.isoformat() if blob.updated else None,
+             )
+         )
+
+     datasets.sort(key=lambda d: d.name)
+     logger.info(f"Discovered {len(datasets)} datasets at {path}")
+     return datasets
+
+
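storage.Client() above relies on Application Default Credentials; for a publicly readable bucket an anonymous client is enough. A sketch of the same listing call outside the module (the bucket name and prefix are placeholders):

from google.cloud import storage

# Anonymous access works for public buckets; authenticated environments
# can use storage.Client(), as _discover_gcs does.
client = storage.Client.create_anonymous_client()
bucket = client.bucket("my-public-bucket")  # placeholder

# delimiter="/" keeps the listing non-recursive, mirroring recursive=False above.
for blob in bucket.list_blobs(prefix="cellxgene/", delimiter="/"):
    if not blob.name.endswith("/"):
        print(blob.name, blob.size, blob.updated)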
+ def _discover_gcs_with_retry(
+     path: str,
+     pattern: str,
+     recursive: bool,
+     max_retries: int = 3,
+     backoff_seconds: float = 1.0,
+ ) -> List[DiscoveredDataset]:
+     """GCS discovery with exponential backoff retry."""
+     try:
+         from google.api_core import exceptions as gcs_exceptions
+     except ImportError:
+         # If we can't import google.api_core, just try once without retry
+         return _discover_gcs(path, pattern, recursive)
+
+     last_error = None
+     for attempt in range(max_retries):
+         try:
+             return _discover_gcs(path, pattern, recursive)
+         except gcs_exceptions.ServiceUnavailable as e:
+             last_error = e
+             if attempt < max_retries - 1:
+                 wait_time = backoff_seconds * (2**attempt)
+                 logger.warning(f"GCS unavailable, retrying in {wait_time}s...")
+                 time.sleep(wait_time)
+         except gcs_exceptions.Forbidden as e:
+             raise PermissionError(f"Access denied to GCS path: {path}. {e}")
+         except gcs_exceptions.NotFound as e:
+             raise FileNotFoundError(f"GCS path not found: {path}. {e}")
+
+     raise ConnectionError(
+         f"GCS unavailable after {max_retries} retries: {last_error}"
+     )
+
+
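With the defaults above (max_retries=3, backoff_seconds=1.0), the sleep between attempts doubles each time; a quick way to see the schedule:

max_retries, backoff_seconds = 3, 1.0

# Sleeps happen after every failed attempt except the last one.
waits = [backoff_seconds * (2**attempt) for attempt in range(max_retries - 1)]
print(waits)  # [1.0, 2.0]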
+ def _format_size(size_bytes: Optional[int]) -> str:
+     """Format bytes as human-readable string."""
+     if size_bytes is None:
+         return "unknown"
+
+     size = float(size_bytes)
+     for unit in ["B", "KB", "MB", "GB", "TB"]:
+         if size < 1024:
+             return f"{size:.1f} {unit}"
+         size /= 1024
+     return f"{size:.1f} PB"
+
+
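A few spot checks of the size formatter (it is a private helper, so the import is for illustration only):

from spatialcore.annotation.discovery import _format_size

print(_format_size(None))           # unknown
print(_format_size(0))              # 0.0 B
print(_format_size(2_470_000_000))  # 2.3 GB
print(_format_size(1024**5))        # 1.0 PB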
+ def _format_timestamp(timestamp: float) -> str:
+     """Format Unix timestamp as ISO string."""
+     return datetime.fromtimestamp(timestamp).isoformat()
+
+
+ def _glob_to_regex(pattern: str) -> str:
+     r"""Convert glob pattern to regex.
+
+     Examples
+     --------
+     >>> _glob_to_regex("*.h5ad")
+     '^.*\\.h5ad$'
+     >>> _glob_to_regex("data_?.csv")
+     '^data_.\\.csv$'
+     """
+     # Escape special regex chars except * and ?
+     pattern = re.escape(pattern)
+     # Convert glob wildcards to regex
+     pattern = pattern.replace(r"\*", ".*")
+     pattern = pattern.replace(r"\?", ".")
+     return f"^{pattern}$"
+
+
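For simple patterns the same filtering could be done with the standard library's fnmatch; a sketch comparing the two approaches (a design alternative, not what the module uses):

import fnmatch
import re

from spatialcore.annotation.discovery import _glob_to_regex

filenames = ["healthy_liver.h5ad", "hcc_liver.h5ad", "readme.txt"]

# Module approach: translate the glob once, then re.match each filename.
regex = _glob_to_regex("*.h5ad")
via_regex = [f for f in filenames if re.match(regex, f)]

# Standard-library alternative; note fnmatch also honors [...] character
# classes, which _glob_to_regex escapes.
via_fnmatch = fnmatch.filter(filenames, "*.h5ad")

assert via_regex == via_fnmatch == ["healthy_liver.h5ad", "hcc_liver.h5ad"]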
+ def print_discovery_summary(datasets: List[DiscoveredDataset]) -> None:
+     """Print a formatted summary of discovered datasets.
+
+     Parameters
+     ----------
+     datasets : List[DiscoveredDataset]
+         List of discovered datasets from discover_training_data.
+
+     Examples
+     --------
+     >>> datasets = discover_training_data("./references/")
+     >>> print_discovery_summary(datasets)
+     Found 3 datasets:
+      1. healthy_liver            2.3 GB (local)
+      2. hcc_liver                1.8 GB (local)
+      3. colon_atlas              5.1 GB (local)
+     Total: 9.2 GB
+     """
+     if not datasets:
+         print("No datasets found.")
+         return
+
+     print(f"Found {len(datasets)} datasets:")
+     total_bytes = 0
+     for i, ds in enumerate(datasets, 1):
+         storage_tag = f"({ds.storage_type})"
+         print(f" {i}. {ds.name:<20} {ds.size_human:>10} {storage_tag}")
+         if ds.size_bytes:
+             total_bytes += ds.size_bytes
+
+     if total_bytes > 0:
+         print(f"Total: {_format_size(total_bytes)}")
+
+
+ # ============================================================================
+ # Local Metadata CSV Support
+ # ============================================================================
+
+ def load_local_metadata(
+     metadata_csv: Union[str, Path],
+     sample_csv: Optional[Union[str, Path]] = None,
+ ) -> "tuple[pd.DataFrame, Optional[pd.DataFrame]]":
+     """
+     Load local scRNAseq metadata and sample summaries.
+
+     Metadata CSV should contain columns like:
+     - file_path: Path to h5ad file
+     - tissue: Tissue type
+     - condition: Disease/healthy status
+     - n_cells: Number of cells
+     - label_column: Cell type column name
+
+     Parameters
+     ----------
+     metadata_csv : str or Path
+         Path to metadata CSV file.
+     sample_csv : str or Path, optional
+         Path to sample-level summary CSV.
+
+     Returns
+     -------
+     Tuple[pd.DataFrame, Optional[pd.DataFrame]]
+         (metadata DataFrame, sample DataFrame or None)
+
+     Examples
+     --------
+     >>> from spatialcore.annotation.discovery import load_local_metadata
+     >>> meta, samples = load_local_metadata("references_metadata.csv")
+     >>> print(meta.columns.tolist())
+     ['file_path', 'tissue', 'condition', 'n_cells', 'label_column']
+     """
+     import pandas as pd
+
+     metadata_csv = Path(metadata_csv)
+     if not metadata_csv.exists():
+         raise FileNotFoundError(f"Metadata CSV not found: {metadata_csv}")
+
+     metadata_df = pd.read_csv(metadata_csv)
+     logger.info(f"Loaded metadata: {len(metadata_df)} entries from {metadata_csv}")
+
+     sample_df = None
+     if sample_csv is not None:
+         sample_csv = Path(sample_csv)
+         if sample_csv.exists():
+             sample_df = pd.read_csv(sample_csv)
+             logger.info(f"Loaded sample summary: {len(sample_df)} entries")
+         else:
+             logger.warning(f"Sample CSV not found: {sample_csv}")
+
+     return metadata_df, sample_df
+
+
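A minimal sketch of what the metadata CSV could contain; the column names follow the defaults above, the rows are invented for illustration:

import io

import pandas as pd

# Hypothetical contents of references_metadata.csv.
csv_text = """\
file_path,tissue,condition,n_cells,label_column
/data/references/liver/healthy_liver.h5ad,liver,healthy,52000,cell_type
/data/references/liver/hcc_liver.h5ad,liver,cancer,38000,cell_type
/data/references/colon/colon_atlas.h5ad,colon,healthy,120000,cell_type
"""

meta = pd.read_csv(io.StringIO(csv_text))
print(meta.columns.tolist())
# ['file_path', 'tissue', 'condition', 'n_cells', 'label_column']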
+ def query_local_references(
+     metadata_df: "pd.DataFrame",
+     tissue: Optional[str] = None,
+     condition: Optional[str] = None,
+     min_cells: int = 1000,
+     file_column: str = "file_path",
+     tissue_column: str = "tissue",
+     condition_column: str = "condition",
+     cells_column: str = "n_cells",
+ ) -> "pd.DataFrame":
+     """
+     Query local references by tissue/condition filters.
+
+     Parameters
+     ----------
+     metadata_df : pd.DataFrame
+         Metadata DataFrame from load_local_metadata.
+     tissue : str, optional
+         Filter by tissue type (case-insensitive substring match).
+     condition : str, optional
+         Filter by condition (e.g., "healthy", "cancer").
+     min_cells : int, default 1000
+         Minimum cells required.
+     file_column : str, default "file_path"
+         Column containing file paths.
+     tissue_column : str, default "tissue"
+         Column containing tissue types.
+     condition_column : str, default "condition"
+         Column containing conditions.
+     cells_column : str, default "n_cells"
+         Column containing cell counts.
+
+     Returns
+     -------
+     pd.DataFrame
+         Filtered metadata DataFrame.
+
+     Examples
+     --------
+     >>> from spatialcore.annotation.discovery import load_local_metadata, query_local_references
+     >>> meta, _ = load_local_metadata("references_metadata.csv")
+     >>> liver_refs = query_local_references(meta, tissue="liver", min_cells=5000)
+     >>> print(liver_refs[["file_path", "tissue", "n_cells"]])
+     """
+     result = metadata_df.copy()
+
+     # Filter by tissue
+     if tissue is not None and tissue_column in result.columns:
+         tissue_lower = tissue.lower()
+         mask = result[tissue_column].astype(str).str.lower().str.contains(tissue_lower, na=False)
+         result = result[mask]
+         logger.info(f" Filtered by tissue '{tissue}': {len(result)} remaining")
+
+     # Filter by condition
+     if condition is not None and condition_column in result.columns:
+         condition_lower = condition.lower()
+         mask = result[condition_column].astype(str).str.lower().str.contains(condition_lower, na=False)
+         result = result[mask]
+         logger.info(f" Filtered by condition '{condition}': {len(result)} remaining")
+
+     # Filter by min cells
+     if cells_column in result.columns:
+         result = result[result[cells_column] >= min_cells]
+         logger.info(f" Filtered by min_cells={min_cells}: {len(result)} remaining")
+
+     return result
+
+
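Chaining the two helpers, assuming a curated CSV like the sketch above exists on disk (the filename is a placeholder):

from spatialcore.annotation.discovery import load_local_metadata, query_local_references

meta, _ = load_local_metadata("references_metadata.csv")

# Substring match on tissue/condition, hard threshold on cell count.
liver_refs = query_local_references(meta, tissue="liver", condition="healthy", min_cells=5000)
print(liver_refs[["file_path", "tissue", "condition", "n_cells"]])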
471
+ def create_metadata_template(
472
+ output_path: Union[str, Path],
473
+ discovered_datasets: Optional[List[DiscoveredDataset]] = None,
474
+ ) -> Path:
475
+ """
476
+ Create a metadata CSV template for local references.
477
+
478
+ Optionally pre-populate with discovered datasets.
479
+
480
+ Parameters
481
+ ----------
482
+ output_path : str or Path
483
+ Path to save template CSV.
484
+ discovered_datasets : List[DiscoveredDataset], optional
485
+ Pre-populate with discovered datasets.
486
+
487
+ Returns
488
+ -------
489
+ Path
490
+ Path to created template.
491
+
492
+ Examples
493
+ --------
494
+ >>> from spatialcore.annotation.discovery import discover_training_data, create_metadata_template
495
+ >>> datasets = discover_training_data("./references/")
496
+ >>> create_metadata_template("metadata_template.csv", datasets)
497
+ """
498
+ import pandas as pd
499
+
500
+ output_path = Path(output_path)
501
+
502
+ if discovered_datasets:
503
+ data = [
504
+ {
505
+ "file_path": ds.path,
506
+ "name": ds.name,
507
+ "tissue": "",
508
+ "condition": "",
509
+ "n_cells": None,
510
+ "label_column": "cell_type",
511
+ "notes": "",
512
+ }
513
+ for ds in discovered_datasets
514
+ ]
515
+ df = pd.DataFrame(data)
516
+ else:
517
+ df = pd.DataFrame(columns=[
518
+ "file_path",
519
+ "name",
520
+ "tissue",
521
+ "condition",
522
+ "n_cells",
523
+ "label_column",
524
+ "notes",
525
+ ])
526
+
527
+ df.to_csv(output_path, index=False)
528
+ logger.info(f"Created metadata template: {output_path}")
529
+ return output_path
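
Putting the module together, a plausible end-to-end flow for registering local references; paths are placeholders and the curation of the template CSV happens by hand between steps 2 and 3:

from spatialcore.annotation.discovery import (
    create_metadata_template,
    discover_training_data,
    load_local_metadata,
    print_discovery_summary,
    query_local_references,
)

# 1. Find candidate .h5ad files on disk.
datasets = discover_training_data("./references/", pattern="*.h5ad", recursive=True)
print_discovery_summary(datasets)

# 2. Write a template pre-filled with the discovered paths,
#    then fill in tissue, condition, and n_cells manually.
create_metadata_template("references_metadata.csv", datasets)

# 3. Load the curated CSV and select references for training.
meta, _ = load_local_metadata("references_metadata.csv")
liver_refs = query_local_references(meta, tissue="liver", min_cells=5000)
print(liver_refs)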