radiobject-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,249 @@
+ """Factory functions for creating training dataloaders."""
+
+ from __future__ import annotations
+
+ from collections.abc import Callable
+ from typing import TYPE_CHECKING, Any, Sequence
+
+ from torch.utils.data import DataLoader
+
+ from radiobject._types import LabelSource
+ from radiobject.ml.config import DatasetConfig, LoadingMode
+ from radiobject.ml.datasets.collection_dataset import VolumeCollectionDataset
+ from radiobject.ml.datasets.segmentation_dataset import SegmentationDataset
+ from radiobject.ml.utils.worker_init import worker_init_fn
+
+ if TYPE_CHECKING:
+     from radiobject.volume_collection import VolumeCollection
+
+
+ def create_training_dataloader(
+     collections: VolumeCollection | Sequence[VolumeCollection],
+     labels: LabelSource = None,
+     batch_size: int = 4,
+     patch_size: tuple[int, int, int] | None = None,
+     num_workers: int = 4,
+     pin_memory: bool = True,
+     persistent_workers: bool = True,
+     transform: Callable[[dict[str, Any]], dict[str, Any]] | None = None,
+     patches_per_volume: int = 1,
+ ) -> DataLoader:
+     """Create a DataLoader configured for training from VolumeCollection(s).
+
+     Args:
+         collections: Single VolumeCollection or list for multi-modal training.
+             Multi-modal collections are stacked along channel dimension.
+         labels: Label source. Can be:
+             - str: Column name in collection's obs DataFrame
+             - pd.DataFrame: With obs_id as column/index and label values
+             - dict[str, Any]: Mapping from obs_id to label
+             - Callable[[str], Any]: Function taking obs_id, returning label
+             - None: No labels
+         batch_size: Samples per batch.
+         patch_size: If provided, extract random patches of this size.
+         num_workers: DataLoader worker processes.
+         pin_memory: Pin tensors to CUDA memory.
+         persistent_workers: Keep workers alive between epochs.
+         transform: Transform function applied to each sample.
+             MONAI dict transforms (e.g., RandFlipd) work directly.
+         patches_per_volume: Number of patches to extract per volume per epoch.
+
+     Returns:
+         DataLoader configured for training with shuffle enabled.
+     """
+     loading_mode = LoadingMode.PATCH if patch_size else LoadingMode.FULL_VOLUME
+
+     config = DatasetConfig(
+         loading_mode=loading_mode,
+         patch_size=patch_size,
+         patches_per_volume=patches_per_volume,
+     )
+
+     dataset = VolumeCollectionDataset(
+         collections, config=config, labels=labels, transform=transform
+     )
+
+     effective_workers = num_workers if num_workers > 0 else 0
+     effective_persistent = persistent_workers and effective_workers > 0
+
+     return DataLoader(
+         dataset,
+         batch_size=batch_size,
+         shuffle=True,
+         num_workers=effective_workers,
+         pin_memory=pin_memory and effective_workers > 0,
+         persistent_workers=effective_persistent,
+         worker_init_fn=worker_init_fn if effective_workers > 0 else None,
+         drop_last=True,
+     )
+
+
+ def create_validation_dataloader(
+     collections: VolumeCollection | Sequence[VolumeCollection],
+     labels: LabelSource = None,
+     batch_size: int = 4,
+     patch_size: tuple[int, int, int] | None = None,
+     num_workers: int = 4,
+     pin_memory: bool = True,
+     transform: Callable[[dict[str, Any]], dict[str, Any]] | None = None,
+ ) -> DataLoader:
+     """Create a DataLoader configured for validation (no shuffle, no drop_last).
+
+     Args:
+         collections: Single VolumeCollection or list for multi-modal validation.
+         labels: Label source (see create_training_dataloader for options).
+         batch_size: Samples per batch.
+         patch_size: If provided, extract patches of this size.
+         num_workers: DataLoader worker processes.
+         pin_memory: Pin tensors to CUDA memory.
+         transform: Transform function applied to each sample.
+             MONAI dict transforms work directly.
+
+     Returns:
+         DataLoader configured for validation.
+
+     Example::
+
+         from monai.transforms import Compose, NormalizeIntensityd
+
+         transform = Compose([NormalizeIntensityd(keys="image")])
+         loader = create_validation_dataloader(radi.CT, labels="has_tumor", transform=transform)
+     """
+     loading_mode = LoadingMode.PATCH if patch_size else LoadingMode.FULL_VOLUME
+
+     config = DatasetConfig(
+         loading_mode=loading_mode,
+         patch_size=patch_size,
+         patches_per_volume=1,
+     )
+
+     dataset = VolumeCollectionDataset(
+         collections, config=config, labels=labels, transform=transform
+     )
+
+     effective_workers = num_workers if num_workers > 0 else 0
+
+     return DataLoader(
+         dataset,
+         batch_size=batch_size,
+         shuffle=False,
+         num_workers=effective_workers,
+         pin_memory=pin_memory and effective_workers > 0,
+         worker_init_fn=worker_init_fn if effective_workers > 0 else None,
+         drop_last=False,
+     )
+
+
+ def create_inference_dataloader(
+     collections: VolumeCollection | Sequence[VolumeCollection],
+     batch_size: int = 1,
+     num_workers: int = 4,
+     pin_memory: bool = True,
+     transform: Callable[[dict[str, Any]], dict[str, Any]] | None = None,
+ ) -> DataLoader:
+     """Create a DataLoader configured for inference (full volumes, no shuffle).
+
+     Args:
+         collections: Single VolumeCollection or list for multi-modal inference.
+         batch_size: Samples per batch.
+         num_workers: DataLoader worker processes.
+         pin_memory: Pin tensors to CUDA memory.
+         transform: Transform function applied to each sample.
+
+     Returns:
+         DataLoader configured for inference.
+
+     Example::
+
+         from monai.transforms import NormalizeIntensityd
+
+         transform = NormalizeIntensityd(keys="image")
+         loader = create_inference_dataloader(radi.CT, transform=transform)
+     """
+     config = DatasetConfig(loading_mode=LoadingMode.FULL_VOLUME)
+
+     dataset = VolumeCollectionDataset(collections, config=config, transform=transform)
+
+     effective_workers = num_workers if num_workers > 0 else 0
+
+     return DataLoader(
+         dataset,
+         batch_size=batch_size,
+         shuffle=False,
+         num_workers=effective_workers,
+         pin_memory=pin_memory and effective_workers > 0,
+         worker_init_fn=worker_init_fn if effective_workers > 0 else None,
+     )
+
+
+ def create_segmentation_dataloader(
+     image: VolumeCollection,
+     mask: VolumeCollection,
+     batch_size: int = 4,
+     patch_size: tuple[int, int, int] | None = None,
+     num_workers: int = 4,
+     pin_memory: bool = True,
+     persistent_workers: bool = True,
+     image_transform: Callable[[dict[str, Any]], dict[str, Any]] | None = None,
+     spatial_transform: Callable[[dict[str, Any]], dict[str, Any]] | None = None,
+     foreground_sampling: bool = False,
+     foreground_threshold: float = 0.01,
+     patches_per_volume: int = 1,
+ ) -> DataLoader:
+     """Create a DataLoader for segmentation training with separate image/mask handling.
+
+     Unlike create_training_dataloader which stacks collections as channels, this
+     returns separate "image" and "mask" tensors. This is cleaner for segmentation
+     workflows where different transforms need to be applied to images vs masks.
+
+     Args:
+         image: VolumeCollection containing input images (CT, MRI, etc.).
+         mask: VolumeCollection containing segmentation masks.
+         batch_size: Samples per batch.
+         patch_size: If provided, extract random patches of this size.
+         num_workers: DataLoader worker processes.
+         pin_memory: Pin tensors to CUDA memory.
+         persistent_workers: Keep workers alive between epochs.
+         image_transform: Transform applied only to "image" key (e.g., normalization).
+         spatial_transform: Transform applied to both "image" and "mask" keys
+             (e.g., random flips, rotations).
+         foreground_sampling: If True, bias patch sampling toward regions with
+             foreground (non-zero mask values).
+         foreground_threshold: Minimum fraction of foreground voxels in patch
+             when foreground_sampling is enabled.
+         patches_per_volume: Number of patches to extract per volume per epoch.
+
+     Returns:
+         DataLoader yielding {"image": (B,1,X,Y,Z), "mask": (B,1,X,Y,Z), ...}
+     """
+     loading_mode = LoadingMode.PATCH if patch_size else LoadingMode.FULL_VOLUME
+
+     config = DatasetConfig(
+         loading_mode=loading_mode,
+         patch_size=patch_size,
+         patches_per_volume=patches_per_volume,
+     )
+
+     dataset = SegmentationDataset(
+         image=image,
+         mask=mask,
+         config=config,
+         image_transform=image_transform,
+         spatial_transform=spatial_transform,
+         foreground_sampling=foreground_sampling,
+         foreground_threshold=foreground_threshold,
+     )
+
+     effective_workers = num_workers if num_workers > 0 else 0
+     effective_persistent = persistent_workers and effective_workers > 0
+
+     return DataLoader(
+         dataset,
+         batch_size=batch_size,
+         shuffle=True,
+         num_workers=effective_workers,
+         pin_memory=pin_memory and effective_workers > 0,
+         persistent_workers=effective_persistent,
+         worker_init_fn=worker_init_fn if effective_workers > 0 else None,
+         drop_last=True,
+     )
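
Usage sketch for the factory functions above. Only the create_* signatures and the MONAI transform pattern come from the diff; the factory module's file name is not shown (the import path below is an assumption), and `radi` stands in for an already-opened radiobject handle with CT and SEG collections, following the `radi.CT` shorthand used in the docstrings. Treat this as an illustration of the intended call pattern, not documented API:

    # Hypothetical usage sketch -- the module path and the `radi` handle are assumptions.
    from monai.transforms import Compose, NormalizeIntensityd, RandFlipd

    from radiobject.ml.dataloaders import (  # assumed path; file name not shown in this diff
        create_segmentation_dataloader,
        create_training_dataloader,
    )

    # Patch-based classification training, labels read from the obs column "has_tumor".
    train_loader = create_training_dataloader(
        radi.CT,  # an opened VolumeCollection, as in the docstring examples
        labels="has_tumor",
        batch_size=2,
        patch_size=(96, 96, 96),
        patches_per_volume=4,
        transform=Compose([NormalizeIntensityd(keys="image"), RandFlipd(keys="image", prob=0.5)]),
    )

    # Segmentation training with separate image/mask tensors and foreground-biased patches.
    seg_loader = create_segmentation_dataloader(
        image=radi.CT,
        mask=radi.SEG,  # hypothetical mask collection
        patch_size=(96, 96, 96),
        image_transform=NormalizeIntensityd(keys="image"),
        spatial_transform=RandFlipd(keys=["image", "mask"], prob=0.5),
        foreground_sampling=True,
    )
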
@@ -0,0 +1,13 @@
+ """ML utilities."""
+
+ from radiobject.ml.utils.labels import LabelSource, load_labels
+ from radiobject.ml.utils.validation import validate_collection_alignment, validate_uniform_shapes
+ from radiobject.ml.utils.worker_init import worker_init_fn
+
+ __all__ = [
+     "LabelSource",
+     "load_labels",
+     "validate_collection_alignment",
+     "validate_uniform_shapes",
+     "worker_init_fn",
+ ]
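
Assuming this file is the radiobject/ml/utils/__init__.py that the import paths elsewhere in the diff point at, the re-exports let the helpers be imported from the subpackage directly:

    from radiobject.ml.utils import load_labels, validate_uniform_shapes, worker_init_fn
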
@@ -0,0 +1,106 @@
+ """Label loading utilities for ML datasets."""
+
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING, Any
+
+ import pandas as pd
+
+ from radiobject._types import LabelSource
+
+ __all__ = ["LabelSource", "load_labels"]
+
+ if TYPE_CHECKING:
+     from radiobject.volume_collection import VolumeCollection
+
+
+ def load_labels(
+     collection: VolumeCollection,
+     labels: LabelSource,
+     obs_df: pd.DataFrame | None = None,
+ ) -> dict[int, Any] | None:
+     """Load labels from various sources, indexed by volume position.
+
+     Args:
+         collection: VolumeCollection for the primary collection (used to get obs_ids).
+         labels: Label source - see LabelSource type for options.
+         obs_df: Pre-loaded obs DataFrame from the collection. Required when
+             labels is a column name string.
+
+     Returns:
+         Dict mapping volume index to label value, or None if labels is None.
+
+     Raises:
+         ValueError: If label column not found or DataFrame missing required columns.
+     """
+     if labels is None:
+         return None
+
+     obs_ids = collection.obs_ids
+     n_volumes = len(obs_ids)
+     result: dict[int, Any] = {}
+
+     if isinstance(labels, str):
+         # Column name in obs DataFrame
+         if obs_df is None:
+             raise ValueError(
+                 "obs_df required when labels is a column name. "
+                 "Pass collection.obs.read() as obs_df."
+             )
+
+         if labels not in obs_df.columns:
+             raise ValueError(f"Label column '{labels}' not found in obs DataFrame")
+
+         # Build lookup by obs_id
+         if "obs_id" in obs_df.columns:
+             label_lookup = dict(zip(obs_df["obs_id"], obs_df[labels]))
+         elif obs_df.index.name == "obs_id":
+             label_lookup = obs_df[labels].to_dict()
+         else:
+             raise ValueError("obs DataFrame must have 'obs_id' column or index")
+
+         for idx in range(n_volumes):
+             obs_id = obs_ids[idx]
+             if obs_id in label_lookup:
+                 result[idx] = label_lookup[obs_id]
+
+     elif isinstance(labels, pd.DataFrame):
+         # DataFrame with obs_id mapping
+         if "obs_id" in labels.columns:
+             # obs_id as column - use first non-obs_id column as label
+             label_cols = [c for c in labels.columns if c != "obs_id"]
+             if not label_cols:
+                 raise ValueError("Labels DataFrame must have at least one label column")
+             label_col = label_cols[0]
+             label_lookup = dict(zip(labels["obs_id"], labels[label_col]))
+         elif labels.index.name == "obs_id" or labels.index.dtype == object:
+             # obs_id as index
+             label_col = labels.columns[0]
+             label_lookup = labels[label_col].to_dict()
+         else:
+             raise ValueError("Labels DataFrame must have 'obs_id' as column or index")
+
+         for idx in range(n_volumes):
+             obs_id = obs_ids[idx]
+             if obs_id in label_lookup:
+                 result[idx] = label_lookup[obs_id]
+
+     elif isinstance(labels, dict):
+         # Direct mapping from obs_id to label
+         for idx in range(n_volumes):
+             obs_id = obs_ids[idx]
+             if obs_id in labels:
+                 result[idx] = labels[obs_id]
+
+     elif callable(labels):
+         # Function that takes obs_id and returns label
+         for idx in range(n_volumes):
+             obs_id = obs_ids[idx]
+             result[idx] = labels(obs_id)
+
+     else:
+         raise TypeError(
+             f"labels must be str, DataFrame, dict, callable, or None, got {type(labels)}"
+         )
+
+     return result if result else None
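
A minimal sketch of how load_labels resolves the different label sources. The FakeCollection stub below is purely illustrative, exposing only the obs_ids attribute that load_labels actually reads; a real VolumeCollection would come from the package itself:

    import pandas as pd

    from radiobject.ml.utils.labels import load_labels


    class FakeCollection:
        """Illustrative stand-in with only the attribute load_labels reads."""

        obs_ids = ["sub-01", "sub-02", "sub-03"]


    coll = FakeCollection()

    # 1. Column name: requires the pre-loaded obs DataFrame.
    obs_df = pd.DataFrame({"obs_id": ["sub-01", "sub-02", "sub-03"], "has_tumor": [1, 0, 1]})
    print(load_labels(coll, "has_tumor", obs_df=obs_df))  # {0: 1, 1: 0, 2: 1}

    # 2. Dict keyed by obs_id: ids missing from the dict are silently skipped.
    print(load_labels(coll, {"sub-01": 1, "sub-03": 0}))  # {0: 1, 2: 0}

    # 3. Callable: applied to every obs_id in order.
    print(load_labels(coll, lambda obs_id: int(obs_id.endswith("1"))))  # {0: 1, 1: 0, 2: 0}
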
@@ -0,0 +1,85 @@
+ """Validation utilities for ML datasets."""
+
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     from radiobject.volume_collection import VolumeCollection
+
+
+ def validate_collection_alignment(collections: dict[str, VolumeCollection]) -> None:
+     """Validate all collections have matching subjects by obs_subject_id.
+
+     For multi-modal training, volumes from different collections must correspond
+     to the same subjects. This validates alignment using the obs_subject_id field
+     directly (no string parsing).
+
+     Args:
+         collections: Dict mapping collection names to VolumeCollection instances.
+
+     Raises:
+         ValueError: If collections have different volume counts or mismatched subjects.
+     """
+     if len(collections) < 2:
+         return
+
+     names = list(collections.keys())
+     first_name = names[0]
+     first_coll = collections[first_name]
+     n_volumes = len(first_coll)
+
+     first_subjects = set(first_coll.obs_subject_ids)
+
+     for name in names[1:]:
+         coll = collections[name]
+         if len(coll) != n_volumes:
+             raise ValueError(f"Collection '{name}' has {len(coll)} volumes, expected {n_volumes}")
+
+         mod_subjects = set(coll.obs_subject_ids)
+
+         if mod_subjects != first_subjects:
+             missing = first_subjects - mod_subjects
+             extra = mod_subjects - first_subjects
+             raise ValueError(
+                 f"Subject mismatch for collection '{name}': "
+                 f"missing={list(missing)[:3]}, extra={list(extra)[:3]}"
+             )
+
+
+ def validate_uniform_shapes(collections: dict[str, VolumeCollection]) -> tuple[int, int, int]:
+     """Validate all collections have uniform shapes and return the common shape.
+
+     Args:
+         collections: Dict mapping collection names to VolumeCollection instances.
+
+     Returns:
+         Common volume shape (X, Y, Z).
+
+     Raises:
+         ValueError: If any collection has non-uniform shapes or shapes don't match.
+     """
+     shape: tuple[int, int, int] | None = None
+
+     for name, coll in collections.items():
+         if not coll.is_uniform:
+             raise ValueError(
+                 f"Collection '{name}' has heterogeneous shapes. "
+                 f"Resample to uniform dimensions before ML training."
+             )
+
+         coll_shape = coll.shape
+         if coll_shape is None:
+             raise ValueError(f"Collection '{name}' has no shape metadata.")
+
+         if shape is None:
+             shape = coll_shape
+         elif coll_shape != shape:
+             raise ValueError(
+                 f"Shape mismatch: collection '{name}' has shape {coll_shape}, " f"expected {shape}"
+             )
+
+     if shape is None:
+         raise ValueError("No collections provided")
+
+     return shape
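
A sketch of the two validators against lightweight stand-ins. The StubCollection class fakes only the attributes these functions read (__len__, obs_subject_ids, is_uniform, shape) and is not part of the package:

    from radiobject.ml.utils.validation import (
        validate_collection_alignment,
        validate_uniform_shapes,
    )


    class StubCollection:
        """Illustrative stand-in exposing only what the validators touch."""

        def __init__(self, subjects, shape, uniform=True):
            self.obs_subject_ids = subjects
            self.shape = shape
            self.is_uniform = uniform

        def __len__(self):
            return len(self.obs_subject_ids)


    ct = StubCollection(["sub-01", "sub-02"], (128, 128, 64))
    pet = StubCollection(["sub-01", "sub-02"], (128, 128, 64))

    validate_collection_alignment({"CT": ct, "PET": pet})  # passes silently
    print(validate_uniform_shapes({"CT": ct, "PET": pet}))  # (128, 128, 64)

    # A mismatched subject set raises ValueError listing up to three offending ids.
    bad = StubCollection(["sub-01", "sub-99"], (128, 128, 64))
    try:
        validate_collection_alignment({"CT": ct, "PET": bad})
    except ValueError as err:
        print(err)
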
@@ -0,0 +1,10 @@
+ """Worker initialization for DataLoader multiprocessing."""
+
+ from __future__ import annotations
+
+ from radiobject.parallel import create_worker_ctx
+
+
+ def worker_init_fn(worker_id: int) -> None:
+     """Initialize TileDB context for each DataLoader worker."""
+     _ = create_worker_ctx()
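
The factories above pass this hook to every DataLoader they build whenever num_workers > 0; a hand-rolled DataLoader can do the same. The TensorDataset below is only a placeholder for one of the package's dataset classes:

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    from radiobject.ml.utils.worker_init import worker_init_fn

    dataset = TensorDataset(torch.zeros(8, 1, 4, 4, 4))  # placeholder dataset

    loader = DataLoader(
        dataset,
        batch_size=2,
        num_workers=2,
        # Each worker process gets its own TileDB context before it loads any data.
        worker_init_fn=worker_init_fn,
    )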