radiobject 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,219 @@
1
+ """SegmentationDataset - specialized dataset for image/mask segmentation training."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Callable
6
+ from typing import TYPE_CHECKING, Any
7
+
8
+ import numpy as np
9
+ import torch
10
+ from torch.utils.data import Dataset
11
+
12
+ from radiobject.ml.config import DatasetConfig, LoadingMode
13
+ from radiobject.ml.utils.validation import validate_collection_alignment, validate_uniform_shapes
14
+
15
+ if TYPE_CHECKING:
16
+ from radiobject.volume_collection import VolumeCollection
17
+
18
+
19
class SegmentationDataset(Dataset):
    """PyTorch Dataset for segmentation training with explicit image/mask separation.

    Pairs an image VolumeCollection with a mask VolumeCollection and yields
    dict samples containing "image" and "mask" tensors (each with a leading
    channel dimension added via unsqueeze), loaded as full volumes, random 3D
    patches, or 2D axial slices depending on the configured LoadingMode.
    """

    def __init__(
        self,
        image: VolumeCollection,
        mask: VolumeCollection,
        config: DatasetConfig | None = None,
        image_transform: Callable[[dict[str, Any]], dict[str, Any]] | None = None,
        spatial_transform: Callable[[dict[str, Any]], dict[str, Any]] | None = None,
        foreground_sampling: bool = False,
        foreground_threshold: float = 0.01,
        foreground_max_retries: int = 10,
    ):
        """Initialize segmentation dataset.

        Args:
            image: VolumeCollection containing input images (CT, MRI, etc.).
            mask: VolumeCollection containing segmentation masks.
            config: Dataset configuration (loading mode, patch size, etc.).
            image_transform: Transform applied to image only (e.g., normalization).
                Should operate on keys=["image"].
            spatial_transform: Transform applied to both image and mask (e.g., flips).
                Should operate on keys=["image", "mask"].
            foreground_sampling: If True, bias patch sampling toward regions with
                foreground (non-zero mask values).
            foreground_threshold: Minimum fraction of foreground voxels in patch
                when foreground_sampling is enabled.
            foreground_max_retries: Maximum random attempts before accepting any patch.
        """
        self._config = config or DatasetConfig()
        self._image_transform = image_transform
        self._spatial_transform = spatial_transform
        self._foreground_sampling = foreground_sampling
        self._foreground_threshold = foreground_threshold
        self._foreground_max_retries = foreground_max_retries

        # Store collections directly
        self._image = image
        self._mask = mask

        # Cache obs_ids and obs_subject_ids for fast access
        self._obs_ids = image.obs_ids
        self._obs_subject_ids = image.obs_subject_ids

        # Validate alignment between image and mask collections
        collections = {"image": self._image, "mask": self._mask}
        validate_collection_alignment(collections)

        # Validate uniform shapes (also yields the common (X, Y, Z) shape)
        self._volume_shape = validate_uniform_shapes(collections)
        self._n_volumes = len(self._image)

        # Dataset length depends on loading mode: one item per patch, per
        # axial slice, or per volume.
        if self._config.loading_mode == LoadingMode.PATCH:
            self._length = self._n_volumes * self._config.patches_per_volume
        elif self._config.loading_mode == LoadingMode.SLICE_2D:
            self._length = self._n_volumes * self._volume_shape[2]
        else:
            self._length = self._n_volumes

    def __len__(self) -> int:
        return self._length

    def __getitem__(self, idx: int) -> dict[str, Any]:
        if self._config.loading_mode == LoadingMode.PATCH:
            return self._get_patch_item(idx)
        elif self._config.loading_mode == LoadingMode.SLICE_2D:
            return self._get_slice_item(idx)
        else:
            return self._get_full_volume_item(idx)

    @staticmethod
    def _random_start(
        rng: np.random.Generator, max_start: tuple[int, ...]
    ) -> tuple[int, int, int]:
        """Draw a uniform random patch origin in [0, max_start] per axis.

        Returns plain Python ints (not numpy scalars) so the value is
        collate-friendly when surfaced as "patch_start".
        """
        return tuple(
            int(rng.integers(0, max_start[i] + 1)) if max_start[i] > 0 else 0
            for i in range(3)
        )

    def _get_full_volume_item(self, idx: int) -> dict[str, Any]:
        """Load full volume for image and mask."""
        image_data = self._image.iloc[idx].to_numpy()
        mask_data = self._mask.iloc[idx].to_numpy()

        result: dict[str, Any] = {
            # unsqueeze(0) adds the channel dimension expected by 3D models
            "image": torch.from_numpy(image_data).unsqueeze(0),
            "mask": torch.from_numpy(mask_data).unsqueeze(0),
            "idx": idx,
            "obs_id": self._obs_ids[idx],
            "obs_subject_id": self._obs_subject_ids[idx],
        }

        return self._apply_transforms(result)

    def _get_patch_item(self, idx: int) -> dict[str, Any]:
        """Load a random patch from image and mask.

        Raises:
            ValueError: If patch loading mode is active without a patch_size.
        """
        volume_idx = idx // self._config.patches_per_volume
        patch_idx = idx % self._config.patches_per_volume

        patch_size = self._config.patch_size
        # Raise instead of assert: assert is stripped under python -O.
        if patch_size is None:
            raise ValueError("patch_size must be set when loading_mode is PATCH")

        max_start = tuple(max(0, self._volume_shape[i] - patch_size[i]) for i in range(3))

        if self._foreground_sampling:
            # Try to find a patch with sufficient foreground
            start = self._sample_foreground_patch(volume_idx, max_start, patch_size, idx)
        else:
            # Seed by global idx so every (volume, patch) pair is reproducible.
            rng = np.random.default_rng(seed=idx)
            start = self._random_start(rng, max_start)

        image_vol = self._image.iloc[volume_idx]
        mask_vol = self._mask.iloc[volume_idx]

        # Same spatial region for image and mask keeps them aligned.
        region = tuple(slice(start[i], start[i] + patch_size[i]) for i in range(3))
        image_data = image_vol.slice(*region)
        mask_data = mask_vol.slice(*region)

        result: dict[str, Any] = {
            "image": torch.from_numpy(image_data).unsqueeze(0),
            "mask": torch.from_numpy(mask_data).unsqueeze(0),
            "idx": volume_idx,
            "patch_idx": patch_idx,
            "patch_start": start,
            "obs_id": self._obs_ids[volume_idx],
            "obs_subject_id": self._obs_subject_ids[volume_idx],
        }

        return self._apply_transforms(result)

    def _sample_foreground_patch(
        self,
        volume_idx: int,
        max_start: tuple[int, ...],
        patch_size: tuple[int, int, int],
        seed: int,
    ) -> tuple[int, int, int]:
        """Sample a patch position biased toward foreground regions.

        Draws up to ``foreground_max_retries`` random positions and returns the
        first whose mask patch has a non-zero voxel fraction of at least
        ``foreground_threshold``; otherwise returns the last sampled position.
        If ``foreground_max_retries <= 0``, returns the origin (0, 0, 0),
        which is always a valid start.
        """
        rng = np.random.default_rng(seed=seed)
        mask_vol = self._mask.iloc[volume_idx]

        # Fix: previously `start` was unbound (UnboundLocalError) when
        # foreground_max_retries <= 0 because the loop body never ran.
        start: tuple[int, int, int] = (0, 0, 0)

        for _ in range(self._foreground_max_retries):
            start = self._random_start(rng, max_start)

            mask_patch = mask_vol.slice(
                slice(start[0], start[0] + patch_size[0]),
                slice(start[1], start[1] + patch_size[1]),
                slice(start[2], start[2] + patch_size[2]),
            )
            # Guard against an empty patch to avoid ZeroDivisionError.
            if mask_patch.size == 0:
                continue
            foreground_ratio = np.count_nonzero(mask_patch) / mask_patch.size

            if foreground_ratio >= self._foreground_threshold:
                return start

        # Fallback: no sampled patch met the threshold.
        return start

    def _get_slice_item(self, idx: int) -> dict[str, Any]:
        """Load a 2D axial slice from image and mask."""
        volume_idx = idx // self._volume_shape[2]
        slice_idx = idx % self._volume_shape[2]

        image_data = self._image.iloc[volume_idx].axial(slice_idx)
        mask_data = self._mask.iloc[volume_idx].axial(slice_idx)

        result: dict[str, Any] = {
            "image": torch.from_numpy(image_data).unsqueeze(0),
            "mask": torch.from_numpy(mask_data).unsqueeze(0),
            "idx": volume_idx,
            "slice_idx": slice_idx,
            "obs_id": self._obs_ids[volume_idx],
            "obs_subject_id": self._obs_subject_ids[volume_idx],
        }

        return self._apply_transforms(result)

    def _apply_transforms(self, result: dict[str, Any]) -> dict[str, Any]:
        """Apply spatial and image transforms to sample dict.

        Order matters: the spatial transform runs first so geometric changes
        hit image and mask identically, then the intensity transform runs on
        the image only.
        """
        # Spatial transform affects both image and mask
        if self._spatial_transform is not None:
            result = self._spatial_transform(result)

        # Image transform affects only image
        if self._image_transform is not None:
            result = self._image_transform(result)

        return result

    @property
    def volume_shape(self) -> tuple[int, int, int]:
        """Shape of each volume (X, Y, Z)."""
        return self._volume_shape

    @property
    def n_volumes(self) -> int:
        """Number of image/mask pairs."""
        return self._n_volumes
@@ -0,0 +1,233 @@
1
+ """Core RadiObjectDataset implementation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING, Any, Callable
6
+
7
+ import numpy as np
8
+ import torch
9
+ from torch.utils.data import Dataset
10
+
11
+ from radiobject.ml.config import DatasetConfig, LoadingMode
12
+
13
+ if TYPE_CHECKING:
14
+ from radiobject.radi_object import RadiObject
15
+ from radiobject.volume_collection import VolumeCollection
16
+
17
+
18
class RadiObjectDataset(Dataset):
    """PyTorch Dataset for loading volumes from RadiObject via TileDB.

    Stacks one or more modalities along a channel axis and yields dict
    samples with an "image" tensor, loaded as full volumes, random 3D
    patches, or 2D axial slices depending on the configured LoadingMode.
    """

    def __init__(
        self,
        radi_object: RadiObject,
        config: DatasetConfig,
        transform: Callable[[dict[str, Any]], dict[str, Any]] | None = None,
    ):
        """Initialize the dataset.

        Args:
            radi_object: Source RadiObject whose collections are read.
            config: Dataset configuration (modalities, loading mode, patch
                size, label column, value filter).
            transform: Optional transform applied to every sample dict.

        Raises:
            ValueError: If no modalities are available, a collection has
                heterogeneous shapes, subjects are misaligned across
                modalities, or the configured label column is missing.
        """
        self._config = config
        self._transform = transform

        modalities = config.modalities or list(radi_object.collection_names)
        if not modalities:
            raise ValueError("No modalities specified and RadiObject has no collections")

        self._modalities = modalities
        self._collections: dict[str, VolumeCollection] = {
            mod: radi_object.collection(mod) for mod in modalities
        }

        # Validate all collections have uniform shapes for batched loading
        for mod in modalities:
            coll = self._collections[mod]
            if not coll.is_uniform:
                raise ValueError(
                    f"Collection '{mod}' has heterogeneous shapes. "
                    f"Call collection.resample_to() to normalize dimensions before ML training."
                )

        first_coll = self._collections[modalities[0]]
        self._n_volumes = len(first_coll)
        self._volume_shape = first_coll.shape  # Guaranteed non-None after uniform check

        if len(modalities) > 1:
            self._validate_subject_alignment()

        self._labels: dict[int, int | float] | None = None
        if config.label_column:
            self._load_labels(radi_object, config.label_column, config.value_filter)

        # Dataset length: one item per patch, per axial slice, or per volume.
        if config.loading_mode == LoadingMode.PATCH:
            self._length = self._n_volumes * config.patches_per_volume
        elif config.loading_mode == LoadingMode.SLICE_2D:
            self._length = self._n_volumes * self._volume_shape[2]
        else:
            self._length = self._n_volumes

    def _validate_subject_alignment(self) -> None:
        """Validate that all modalities have matching subjects.

        Raises:
            ValueError: On a volume-count mismatch or a subject-set mismatch
                relative to the first modality.
        """
        first_mod = self._modalities[0]
        first_coll = self._collections[first_mod]

        first_subjects = set(first_coll.obs_subject_ids)

        for mod in self._modalities[1:]:
            coll = self._collections[mod]
            if len(coll) != self._n_volumes:
                raise ValueError(
                    f"Modality '{mod}' has {len(coll)} volumes, expected {self._n_volumes}"
                )

            mod_subjects = set(coll.obs_subject_ids)

            if mod_subjects != first_subjects:
                missing = first_subjects - mod_subjects
                extra = mod_subjects - first_subjects
                # Show at most three ids per side to keep the message short.
                raise ValueError(
                    f"Subject mismatch for modality '{mod}': "
                    f"missing={list(missing)[:3]}, extra={list(extra)[:3]}"
                )

    @staticmethod
    def _match_label_row(obs_meta: Any, obs_id: str) -> Any:
        """Find the obs_meta row(s) for an observation id.

        Tries, in order: exact obs_id match, obs_subject_id match, then a
        legacy fallback that parses obs_id as "<subject_id>_<suffix>".
        Returns an empty selection when nothing matches.
        """
        match = obs_meta[obs_meta["obs_id"] == obs_id]
        if len(match) == 0:
            match = obs_meta[obs_meta["obs_subject_id"] == obs_id]
        if len(match) == 0:
            parts = obs_id.rsplit("_", 1)
            if len(parts) > 1:
                match = obs_meta[obs_meta["obs_subject_id"] == parts[0]]
        return match

    def _load_labels(
        self,
        radi_object: RadiObject,
        label_column: str,
        value_filter: str | None,
    ) -> None:
        """Load labels from obs_meta dataframe into self._labels.

        Volumes with no matching obs_meta row are simply left unlabeled.

        Raises:
            ValueError: If label_column is absent from obs_meta.
        """
        obs_meta = radi_object.obs_meta.read(value_filter=value_filter)
        if label_column not in obs_meta.columns:
            raise ValueError(f"Label column '{label_column}' not found in obs_meta")

        first_coll = self._collections[self._modalities[0]]
        obs_ids = first_coll.obs_ids
        self._labels = {}
        for idx in range(self._n_volumes):
            match = self._match_label_row(obs_meta, obs_ids[idx])
            if len(match) > 0:
                self._labels[idx] = match[label_column].iloc[0]

    def __len__(self) -> int:
        return self._length

    def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
        if self._config.loading_mode == LoadingMode.PATCH:
            return self._get_patch_item(idx)
        elif self._config.loading_mode == LoadingMode.SLICE_2D:
            return self._get_slice_item(idx)
        else:
            return self._get_full_volume_item(idx)

    def _finalize_sample(
        self, result: dict[str, Any], volume_idx: int
    ) -> dict[str, Any]:
        """Attach the label for volume_idx (if loaded) and apply the transform.

        Numeric labels become 0-d tensors so default collation stacks them;
        any other label type passes through unchanged. Shared by all three
        loading-mode item getters (previously triplicated inline).
        """
        if self._labels is not None and volume_idx in self._labels:
            label = self._labels[volume_idx]
            if isinstance(label, (int, float, np.integer, np.floating)):
                result["label"] = torch.tensor(label)
            else:
                result["label"] = label

        if self._transform is not None:
            result = self._transform(result)

        return result

    def _get_full_volume_item(self, idx: int) -> dict[str, torch.Tensor]:
        """Load full volume for all modalities, stacked on a channel axis."""
        volumes = [self._collections[mod].iloc[idx].to_numpy() for mod in self._modalities]

        stacked = np.stack(volumes, axis=0)
        result: dict[str, Any] = {
            "image": torch.from_numpy(stacked),
            "idx": idx,
        }

        return self._finalize_sample(result, idx)

    def _get_patch_item(self, idx: int) -> dict[str, torch.Tensor]:
        """Load a random patch (same region across modalities).

        Raises:
            ValueError: If patch loading mode is active without a patch_size.
        """
        volume_idx = idx // self._config.patches_per_volume
        patch_idx = idx % self._config.patches_per_volume

        # Seed by global idx so every (volume, patch) pair is reproducible.
        rng = np.random.default_rng(seed=idx)
        patch_size = self._config.patch_size
        # Raise instead of assert: assert is stripped under python -O.
        if patch_size is None:
            raise ValueError("patch_size must be set when loading_mode is PATCH")

        max_start = tuple(max(0, self._volume_shape[i] - patch_size[i]) for i in range(3))
        # Plain ints (not numpy scalars) keep "patch_start" collate-friendly.
        start = tuple(
            int(rng.integers(0, max_start[i] + 1)) if max_start[i] > 0 else 0
            for i in range(3)
        )

        region = tuple(slice(start[i], start[i] + patch_size[i]) for i in range(3))
        volumes = []
        for mod in self._modalities:
            vol = self._collections[mod].iloc[volume_idx]
            volumes.append(vol.slice(*region))

        stacked = np.stack(volumes, axis=0)
        result: dict[str, Any] = {
            "image": torch.from_numpy(stacked),
            "idx": volume_idx,
            "patch_idx": patch_idx,
            "patch_start": start,
        }

        return self._finalize_sample(result, volume_idx)

    def _get_slice_item(self, idx: int) -> dict[str, torch.Tensor]:
        """Load a 2D axial slice from all modalities, stacked on a channel axis."""
        volume_idx = idx // self._volume_shape[2]
        slice_idx = idx % self._volume_shape[2]

        slices = [
            self._collections[mod].iloc[volume_idx].axial(slice_idx) for mod in self._modalities
        ]

        stacked = np.stack(slices, axis=0)
        result: dict[str, Any] = {
            "image": torch.from_numpy(stacked),
            "idx": volume_idx,
            "slice_idx": slice_idx,
        }

        return self._finalize_sample(result, volume_idx)

    @property
    def modalities(self) -> list[str]:
        """List of modalities being loaded."""
        return self._modalities

    @property
    def volume_shape(self) -> tuple[int, int, int]:
        """Shape of each volume (X, Y, Z)."""
        return self._volume_shape
@@ -0,0 +1,82 @@
1
+ """Distributed training utilities for DDP."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Callable
6
+ from typing import TYPE_CHECKING, Any, Sequence
7
+
8
+ from torch.utils.data import DataLoader
9
+ from torch.utils.data.distributed import DistributedSampler
10
+
11
+ from radiobject._types import LabelSource
12
+ from radiobject.ml.config import DatasetConfig, LoadingMode
13
+ from radiobject.ml.datasets.collection_dataset import VolumeCollectionDataset
14
+ from radiobject.ml.utils.worker_init import worker_init_fn
15
+
16
+ if TYPE_CHECKING:
17
+ from radiobject.volume_collection import VolumeCollection
18
+
19
+
20
def create_distributed_dataloader(
    collections: VolumeCollection | Sequence[VolumeCollection],
    rank: int,
    world_size: int,
    labels: LabelSource = None,
    batch_size: int = 4,
    patch_size: tuple[int, int, int] | None = None,
    num_workers: int = 4,
    pin_memory: bool = True,
    transform: Callable[[dict[str, Any]], dict[str, Any]] | None = None,
) -> DataLoader:
    """Build a DDP-ready DataLoader over one or more VolumeCollections.

    Wraps a VolumeCollectionDataset in a DistributedSampler so each rank
    sees a disjoint shard; patch mode is selected automatically whenever a
    patch_size is given.

    Args:
        collections: Single VolumeCollection or list for multi-modal training.
        rank: Current process rank.
        world_size: Total number of processes.
        labels: Label source (see create_training_dataloader for options).
        batch_size: Samples per batch per GPU.
        patch_size: If provided, extract random patches.
        num_workers: DataLoader worker processes.
        pin_memory: Pin tensors to CUDA memory.
        transform: Transform function.

    Returns:
        DataLoader with DistributedSampler configured.
    """
    # Patch mode iff a patch size was requested; otherwise full volumes.
    mode = LoadingMode.PATCH if patch_size else LoadingMode.FULL_VOLUME

    dataset = VolumeCollectionDataset(
        collections,
        config=DatasetConfig(
            loading_mode=mode,
            patch_size=patch_size,
            patches_per_volume=1,
        ),
        labels=labels,
        transform=transform,
    )

    sampler = DistributedSampler(
        dataset,
        num_replicas=world_size,
        rank=rank,
        shuffle=True,
    )

    # Clamp negative worker counts to zero (single-process loading).
    workers = max(num_workers, 0)
    use_workers = workers > 0

    return DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=sampler,
        num_workers=workers,
        pin_memory=pin_memory and use_workers,
        worker_init_fn=worker_init_fn if use_workers else None,
        drop_last=True,
    )
77
+
78
+
79
def set_epoch(dataloader: DataLoader, epoch: int) -> None:
    """Set epoch for DistributedSampler to ensure proper shuffling.

    No-op when the loader has no sampler or a non-distributed one.
    """
    sampler = getattr(dataloader, "sampler", None)
    if isinstance(sampler, DistributedSampler):
        sampler.set_epoch(epoch)