valor-lite 0.36.6__py3-none-any.whl → 0.37.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. valor_lite/cache/__init__.py +11 -0
  2. valor_lite/cache/compute.py +211 -0
  3. valor_lite/cache/ephemeral.py +302 -0
  4. valor_lite/cache/persistent.py +536 -0
  5. valor_lite/classification/__init__.py +5 -10
  6. valor_lite/classification/annotation.py +4 -0
  7. valor_lite/classification/computation.py +233 -251
  8. valor_lite/classification/evaluator.py +882 -0
  9. valor_lite/classification/loader.py +97 -0
  10. valor_lite/classification/metric.py +141 -4
  11. valor_lite/classification/shared.py +184 -0
  12. valor_lite/classification/utilities.py +221 -118
  13. valor_lite/exceptions.py +5 -0
  14. valor_lite/object_detection/__init__.py +5 -4
  15. valor_lite/object_detection/annotation.py +13 -1
  16. valor_lite/object_detection/computation.py +368 -299
  17. valor_lite/object_detection/evaluator.py +804 -0
  18. valor_lite/object_detection/loader.py +292 -0
  19. valor_lite/object_detection/metric.py +152 -3
  20. valor_lite/object_detection/shared.py +206 -0
  21. valor_lite/object_detection/utilities.py +182 -100
  22. valor_lite/semantic_segmentation/__init__.py +5 -4
  23. valor_lite/semantic_segmentation/annotation.py +7 -0
  24. valor_lite/semantic_segmentation/computation.py +20 -110
  25. valor_lite/semantic_segmentation/evaluator.py +414 -0
  26. valor_lite/semantic_segmentation/loader.py +205 -0
  27. valor_lite/semantic_segmentation/shared.py +149 -0
  28. valor_lite/semantic_segmentation/utilities.py +6 -23
  29. {valor_lite-0.36.6.dist-info → valor_lite-0.37.5.dist-info}/METADATA +3 -1
  30. valor_lite-0.37.5.dist-info/RECORD +49 -0
  31. {valor_lite-0.36.6.dist-info → valor_lite-0.37.5.dist-info}/WHEEL +1 -1
  32. valor_lite/classification/manager.py +0 -545
  33. valor_lite/object_detection/manager.py +0 -864
  34. valor_lite/profiling.py +0 -374
  35. valor_lite/semantic_segmentation/benchmark.py +0 -237
  36. valor_lite/semantic_segmentation/manager.py +0 -446
  37. valor_lite-0.36.6.dist-info/RECORD +0 -41
  38. {valor_lite-0.36.6.dist-info → valor_lite-0.37.5.dist-info}/top_level.txt +0 -0
valor_lite/semantic_segmentation/evaluator.py
@@ -0,0 +1,414 @@
+ from __future__ import annotations
+
+ import json
+ from pathlib import Path
+
+ import numpy as np
+ import pyarrow as pa
+ import pyarrow.compute as pc
+ from numpy.typing import NDArray
+
+ from valor_lite.cache import (
+     FileCacheReader,
+     FileCacheWriter,
+     MemoryCacheReader,
+     MemoryCacheWriter,
+ )
+ from valor_lite.exceptions import EmptyCacheError
+ from valor_lite.semantic_segmentation.computation import compute_metrics
+ from valor_lite.semantic_segmentation.metric import MetricType
+ from valor_lite.semantic_segmentation.shared import (
+     EvaluatorInfo,
+     decode_metadata_fields,
+     encode_metadata_fields,
+     extract_counts,
+     extract_labels,
+     generate_cache_path,
+     generate_metadata_path,
+     generate_schema,
+ )
+ from valor_lite.semantic_segmentation.utilities import (
+     unpack_precision_recall_iou_into_metric_lists,
+ )
+
+
+ class Builder:
+     def __init__(
+         self,
+         writer: MemoryCacheWriter | FileCacheWriter,
+         metadata_fields: list[tuple[str, str | pa.DataType]] | None = None,
+     ):
+         self._writer = writer
+         self._metadata_fields = metadata_fields
+
+     @classmethod
+     def in_memory(
+         cls,
+         batch_size: int = 10_000,
+         metadata_fields: list[tuple[str, str | pa.DataType]] | None = None,
+     ):
+         """
+         Create an in-memory evaluator cache.
+
+         Parameters
+         ----------
+         batch_size : int, default=10_000
+             The target number of rows to buffer before writing to the cache.
+         metadata_fields : list[tuple[str, str | pa.DataType]], optional
+             Optional metadata field definitions.
+         """
+         # create cache
+         writer = MemoryCacheWriter.create(
+             schema=generate_schema(metadata_fields),
+             batch_size=batch_size,
+         )
+         return cls(
+             writer=writer,
+             metadata_fields=metadata_fields,
+         )
+
+     @classmethod
+     def persistent(
+         cls,
+         path: str | Path,
+         batch_size: int = 10_000,
+         rows_per_file: int = 100_000,
+         compression: str = "snappy",
+         metadata_fields: list[tuple[str, str | pa.DataType]] | None = None,
+     ):
+         """
+         Create a persistent file-based evaluator cache.
+
+         Parameters
+         ----------
+         path : str | Path
+             Where to store the file-based cache.
+         batch_size : int, default=10_000
+             The target number of rows to buffer before writing to the cache.
+         rows_per_file : int, default=100_000
+             The target number of rows to store per cache file.
+         compression : str, default="snappy"
+             The compression method used when writing cache files.
+         metadata_fields : list[tuple[str, str | pa.DataType]], optional
+             Optional metadata field definitions.
+         """
+         path = Path(path)
+
+         # create cache
+         writer = FileCacheWriter.create(
+             path=generate_cache_path(path),
+             schema=generate_schema(metadata_fields),
+             batch_size=batch_size,
+             rows_per_file=rows_per_file,
+             compression=compression,
+         )
+
+         # write metadata
+         metadata_path = generate_metadata_path(path)
+         with open(metadata_path, "w") as f:
+             encoded_types = encode_metadata_fields(metadata_fields)
+             json.dump(encoded_types, f, indent=2)
+
+         return cls(
+             writer=writer,
+             metadata_fields=metadata_fields,
+         )
+
+     def finalize(
+         self,
+         index_to_label_override: dict[int, str] | None = None,
+     ):
+         """
+         Finalizes the cached data and performs preprocessing steps.
+
+         Parameters
+         ----------
+         index_to_label_override : dict[int, str], optional
+             Pre-configures the label mapping. Used when operating over filtered subsets.
+
+         Returns
+         -------
+         Evaluator
+             A ready-to-use evaluator object.
+         """
+         self._writer.flush()
+         if self._writer.count_rows() == 0:
+             raise EmptyCacheError()
+
+         reader = self._writer.to_reader()
+
+         # extract labels
+         index_to_label = extract_labels(
+             reader=reader,
+             index_to_label_override=index_to_label_override,
+         )
+
+         return Evaluator(
+             reader=reader,
+             index_to_label=index_to_label,
+             metadata_fields=self._metadata_fields,
+         )
+
+
+ class Evaluator:
+     def __init__(
+         self,
+         reader: MemoryCacheReader | FileCacheReader,
+         index_to_label: dict[int, str],
+         metadata_fields: list[tuple[str, str | pa.DataType]] | None = None,
+     ):
+         self._reader = reader
+         self._index_to_label = index_to_label
+         self._metadata_fields = metadata_fields
+
+     @property
+     def info(self) -> EvaluatorInfo:
+         return self.get_info()
+
+     def get_info(
+         self,
+         datums: pc.Expression | None = None,
+         groundtruths: pc.Expression | None = None,
+         predictions: pc.Expression | None = None,
+     ) -> EvaluatorInfo:
+         info = EvaluatorInfo()
+         info.number_of_rows = self._reader.count_rows()
+         info.number_of_labels = len(self._index_to_label)
+         info.metadata_fields = self._metadata_fields
+         (
+             info.number_of_datums,
+             info.number_of_pixels,
+             info.number_of_groundtruth_pixels,
+             info.number_of_prediction_pixels,
+         ) = extract_counts(
+             reader=self._reader,
+             datums=datums,
+             groundtruths=groundtruths,
+             predictions=predictions,
+         )
+         return info
+
+     @classmethod
+     def load(
+         cls,
+         path: str | Path,
+         index_to_label_override: dict[int, str] | None = None,
+     ):
+         """
+         Load from an existing semantic segmentation cache.
+
+         Parameters
+         ----------
+         path : str | Path
+             Path to the existing cache.
+         index_to_label_override : dict[int, str], optional
+             Option to preset the index-to-label mapping. Used when loading from filtered caches.
+         """
+         # validate path
+         path = Path(path)
+         if not path.exists():
+             raise FileNotFoundError(f"Directory does not exist: {path}")
+         elif not path.is_dir():
+             raise NotADirectoryError(
+                 f"Path exists but is not a directory: {path}"
+             )
+
+         # load cache
+         reader = FileCacheReader.load(generate_cache_path(path))
+
+         # extract labels
+         index_to_label = extract_labels(
+             reader=reader,
+             index_to_label_override=index_to_label_override,
+         )
+
+         # read config
+         metadata_path = generate_metadata_path(path)
+         metadata_fields = None
+         with open(metadata_path, "r") as f:
+             metadata_types = json.load(f)
+             metadata_fields = decode_metadata_fields(metadata_types)
+
+         return cls(
+             reader=reader,
+             index_to_label=index_to_label,
+             metadata_fields=metadata_fields,
+         )
+
+     def filter(
+         self,
+         datums: pc.Expression | None = None,
+         groundtruths: pc.Expression | None = None,
+         predictions: pc.Expression | None = None,
+         path: str | Path | None = None,
+     ) -> Evaluator:
+         """
+         Filter the evaluator cache.
+
+         Parameters
+         ----------
+         datums : pc.Expression, optional
+             A filter expression used to filter datums.
+         groundtruths : pc.Expression, optional
+             A filter expression used to filter ground truth annotations.
+         predictions : pc.Expression, optional
+             A filter expression used to filter predictions.
+         path : str | Path, optional
+             Where to store the filtered cache if storing on disk.
+
+         Returns
+         -------
+         Evaluator
+             A new evaluator object containing the filtered cache.
+         """
+         if isinstance(self._reader, FileCacheReader):
+             if not path:
+                 raise ValueError(
+                     "expected path to be defined for file-based cache"
+                 )
+             builder = Builder.persistent(
+                 path=path,
+                 batch_size=self._reader.batch_size,
+                 rows_per_file=self._reader.rows_per_file,
+                 compression=self._reader.compression,
+                 metadata_fields=self.info.metadata_fields,
+             )
+         else:
+             builder = Builder.in_memory(
+                 batch_size=self._reader.batch_size,
+                 metadata_fields=self.info.metadata_fields,
+             )
+
+         for tbl in self._reader.iterate_tables(filter=datums):
+             columns = (
+                 "datum_id",
+                 "gt_label_id",
+                 "pd_label_id",
+             )
+             pairs = np.column_stack([tbl[col].to_numpy() for col in columns])
+
+             n_pairs = pairs.shape[0]
+             gt_ids = pairs[:, (0, 1)].astype(np.int64)
+             pd_ids = pairs[:, (0, 2)].astype(np.int64)
+
+             # mark (datum, label) pairs that survive the ground truth filter
+             if groundtruths is not None:
+                 mask_valid_gt = np.zeros(n_pairs, dtype=np.bool_)
+                 gt_tbl = tbl.filter(groundtruths)
+                 gt_pairs = np.column_stack(
+                     [
+                         gt_tbl[col].to_numpy()
+                         for col in ("datum_id", "gt_label_id")
+                     ]
+                 ).astype(np.int64)
+                 for gt in np.unique(gt_pairs, axis=0):
+                     mask_valid_gt |= (gt_ids == gt).all(axis=1)
+             else:
+                 mask_valid_gt = np.ones(n_pairs, dtype=np.bool_)
+
+             # mark (datum, label) pairs that survive the prediction filter
+             if predictions is not None:
+                 mask_valid_pd = np.zeros(n_pairs, dtype=np.bool_)
+                 pd_tbl = tbl.filter(predictions)
+                 pd_pairs = np.column_stack(
+                     [
+                         pd_tbl[col].to_numpy()
+                         for col in ("datum_id", "pd_label_id")
+                     ]
+                 ).astype(np.int64)
+                 for pd in np.unique(pd_pairs, axis=0):
+                     mask_valid_pd |= (pd_ids == pd).all(axis=1)
+             else:
+                 mask_valid_pd = np.ones(n_pairs, dtype=np.bool_)
+
+             mask_valid = mask_valid_gt | mask_valid_pd
+             mask_valid_gt &= mask_valid
+             mask_valid_pd &= mask_valid
+
+             # null out the filtered side of each pair rather than dropping rows
+             pairs[~mask_valid_gt, 1] = -1
+             pairs[~mask_valid_pd, 2] = -1
+
+             for idx, col in enumerate(columns):
+                 tbl = tbl.set_column(
+                     tbl.schema.names.index(col), col, pa.array(pairs[:, idx])
+                 )
+             builder._writer.write_table(tbl)
+
+         return builder.finalize(index_to_label_override=self._index_to_label)
+
+     def _compute_confusion_matrix_intermediate(
+         self, datums: pc.Expression | None = None
+     ) -> NDArray[np.uint64]:
+         """
+         Accumulates a confusion matrix of pixel counts from the cache.
+
+         Parameters
+         ----------
+         datums : pyarrow.compute.Expression, optional
+             Option to filter datums by an expression.
+
+         Returns
+         -------
+         NDArray[np.uint64]
+             A confusion matrix with shape (n_labels + 1, n_labels + 1), where
+             row and column 0 hold unmatched (background) pixel counts.
+         """
+         n_labels = len(self._index_to_label)
+         confusion_matrix = np.zeros(
+             (n_labels + 1, n_labels + 1), dtype=np.uint64
+         )
+         for tbl in self._reader.iterate_tables(filter=datums):
+             columns = (
+                 "datum_id",
+                 "gt_label_id",
+                 "pd_label_id",
+             )
+             ids = np.column_stack(
+                 [tbl[col].to_numpy() for col in columns]
+             ).astype(np.int64)
+             counts = tbl["count"].to_numpy()
+
+             mask_null_gts = ids[:, 1] == -1
+             mask_null_pds = ids[:, 2] == -1
+             confusion_matrix[0, 0] += counts[
+                 mask_null_gts & mask_null_pds
+             ].sum()
+             for idx in range(n_labels):
+                 mask_gts = ids[:, 1] == idx
+                 for pidx in range(n_labels):
+                     mask_pds = ids[:, 2] == pidx
+                     confusion_matrix[idx + 1, pidx + 1] += counts[
+                         mask_gts & mask_pds
+                     ].sum()
+
+                 mask_unmatched_gts = mask_gts & mask_null_pds
+                 confusion_matrix[idx + 1, 0] += counts[
+                     mask_unmatched_gts
+                 ].sum()
+                 mask_unmatched_pds = mask_null_gts & (ids[:, 2] == idx)
+                 confusion_matrix[0, idx + 1] += counts[
+                     mask_unmatched_pds
+                 ].sum()
+         return confusion_matrix
+
+     def compute_precision_recall_iou(
+         self, datums: pc.Expression | None = None
+     ) -> dict[MetricType, list]:
+         """
+         Performs an evaluation and returns metrics.
+
+         Parameters
+         ----------
+         datums : pyarrow.compute.Expression, optional
+             Option to filter datums by an expression.
+
+         Returns
+         -------
+         dict[MetricType, list]
+             A dictionary mapping MetricType enumerations to lists of computed metrics.
+         """
+         confusion_matrix = self._compute_confusion_matrix_intermediate(
+             datums=datums
+         )
+         results = compute_metrics(confusion_matrix=confusion_matrix)
+         return unpack_precision_recall_iou_into_metric_lists(
+             results=results,
+             index_to_label=self._index_to_label,
+         )
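Together, Builder and Evaluator replace the removed manager.py workflow: a Builder accumulates intermediate pixel counts into a memory- or file-backed cache, and finalize() hands back an Evaluator that computes metrics from it. A minimal usage sketch follows; the cache directory and the datum_uid value are hypothetical, and pc.field is standard pyarrow:

import pyarrow.compute as pc

from valor_lite.semantic_segmentation.evaluator import Evaluator

# reopen a cache previously written by Builder.persistent / Loader (hypothetical path)
evaluator = Evaluator.load("./seg_cache")
print(evaluator.info.number_of_datums)

# metrics over the full cache
metrics = evaluator.compute_precision_recall_iou()

# metrics restricted to a single datum via a pyarrow filter expression
subset_metrics = evaluator.compute_precision_recall_iou(
    datums=pc.field("datum_uid") == "image_001"
)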
valor_lite/semantic_segmentation/loader.py
@@ -0,0 +1,205 @@
+ import numpy as np
+ import pyarrow as pa
+ from tqdm import tqdm
+
+ from valor_lite.cache import FileCacheWriter, MemoryCacheWriter
+ from valor_lite.semantic_segmentation.annotation import Segmentation
+ from valor_lite.semantic_segmentation.computation import compute_intermediates
+ from valor_lite.semantic_segmentation.evaluator import Builder
+
+
+ class Loader(Builder):
+     def __init__(
+         self,
+         writer: MemoryCacheWriter | FileCacheWriter,
+         metadata_fields: list[tuple[str, str | pa.DataType]] | None = None,
+     ):
+         super().__init__(
+             writer=writer,
+             metadata_fields=metadata_fields,
+         )
+
+         # internal state
+         self._labels: dict[str, int] = {}
+         self._index_to_label: dict[int, str] = {}
+         self._datum_count = 0
+
+     def _add_label(self, value: str) -> int:
+         idx = self._labels.get(value, None)
+         if idx is None:
+             idx = len(self._labels)
+             self._labels[value] = idx
+             self._index_to_label[idx] = value
+         return idx
+
+     def add_data(
+         self,
+         segmentations: list[Segmentation],
+         show_progress: bool = False,
+     ):
+         """
+         Adds segmentations to the cache.
+
+         Parameters
+         ----------
+         segmentations : list[Segmentation]
+             A list of Segmentation objects.
+         show_progress : bool, default=False
+             Toggle for the tqdm progress bar.
+         """
+         disable_tqdm = not show_progress
+         for segmentation in tqdm(segmentations, disable=disable_tqdm):
+
+             # map ground truth and prediction labels to integer indices
+             groundtruth_labels = -1 * np.ones(
+                 len(segmentation.groundtruths), dtype=np.int64
+             )
+             for idx, groundtruth in enumerate(segmentation.groundtruths):
+                 label_idx = self._add_label(groundtruth.label)
+                 groundtruth_labels[idx] = label_idx
+
+             prediction_labels = -1 * np.ones(
+                 len(segmentation.predictions), dtype=np.int64
+             )
+             for idx, prediction in enumerate(segmentation.predictions):
+                 label_idx = self._add_label(prediction.label)
+                 prediction_labels[idx] = label_idx
+
+             # flatten masks into (n_annotations, n_pixels) boolean arrays
+             if segmentation.groundtruths:
+                 combined_groundtruths = np.stack(
+                     [
+                         groundtruth.mask.flatten()
+                         for groundtruth in segmentation.groundtruths
+                     ],
+                     axis=0,
+                 )
+             else:
+                 combined_groundtruths = np.zeros(
+                     (1, segmentation.shape[0] * segmentation.shape[1]),
+                     dtype=np.bool_,
+                 )
+
+             if segmentation.predictions:
+                 combined_predictions = np.stack(
+                     [
+                         prediction.mask.flatten()
+                         for prediction in segmentation.predictions
+                     ],
+                     axis=0,
+                 )
+             else:
+                 combined_predictions = np.zeros(
+                     (1, segmentation.shape[0] * segmentation.shape[1]),
+                     dtype=np.bool_,
+                 )
+
+             n_labels = len(self._labels)
+             counts = compute_intermediates(
+                 groundtruths=combined_groundtruths,
+                 predictions=combined_predictions,
+                 groundtruth_labels=groundtruth_labels,
+                 prediction_labels=prediction_labels,
+                 n_labels=n_labels,
+             )
+
+             # prepare metadata
+             datum_metadata = (
+                 segmentation.metadata if segmentation.metadata else {}
+             )
+             gt_metadata = {
+                 self._labels[gt.label]: gt.metadata
+                 for gt in segmentation.groundtruths
+                 if gt.metadata
+             }
+             pd_metadata = {
+                 self._labels[pd.label]: pd.metadata
+                 for pd in segmentation.predictions
+                 if pd.metadata
+             }
+
+             # cache formatting
+             rows = []
+             for idx in range(n_labels):
+                 label = self._index_to_label[idx]
+                 for pidx in range(n_labels):
+                     # write non-zero intersections to cache
+                     if counts[idx + 1, pidx + 1] > 0:
+                         plabel = self._index_to_label[pidx]
+                         rows.append(
+                             {
+                                 # metadata
+                                 **datum_metadata,
+                                 **gt_metadata.get(idx, {}),
+                                 **pd_metadata.get(pidx, {}),
+                                 # datum
+                                 "datum_uid": segmentation.uid,
+                                 "datum_id": self._datum_count,
+                                 # groundtruth
+                                 "gt_label": label,
+                                 "gt_label_id": idx,
+                                 # prediction
+                                 "pd_label": plabel,
+                                 "pd_label_id": pidx,
+                                 # pair
+                                 "count": counts[idx + 1, pidx + 1],
+                             }
+                         )
+                 # write all unmatched pairs to preserve labels
+                 rows.extend(
+                     [
+                         {
+                             # metadata
+                             **datum_metadata,
+                             **gt_metadata.get(idx, {}),
+                             # datum
+                             "datum_uid": segmentation.uid,
+                             "datum_id": self._datum_count,
+                             # groundtruth
+                             "gt_label": label,
+                             "gt_label_id": idx,
+                             # prediction
+                             "pd_label": None,
+                             "pd_label_id": -1,
+                             # pair
+                             "count": counts[idx + 1, 0],
+                         },
+                         {
+                             # metadata
+                             **datum_metadata,
+                             **gt_metadata.get(idx, {}),
+                             **pd_metadata.get(idx, {}),
+                             # datum
+                             "datum_uid": segmentation.uid,
+                             "datum_id": self._datum_count,
+                             # groundtruth
+                             "gt_label": None,
+                             "gt_label_id": -1,
+                             # prediction
+                             "pd_label": label,
+                             "pd_label_id": idx,
+                             # pair
+                             "count": counts[0, idx + 1],
+                         },
+                     ]
+                 )
+             # background-background pixel count
+             rows.append(
+                 {
+                     # metadata
+                     **datum_metadata,
+                     # datum
+                     "datum_uid": segmentation.uid,
+                     "datum_id": self._datum_count,
+                     # groundtruth
+                     "gt_label": None,
+                     "gt_label_id": -1,
+                     # prediction
+                     "pd_label": None,
+                     "pd_label_id": -1,
+                     # pair
+                     "count": counts[0, 0],
+                 }
+             )
+             self._writer.write_rows(rows)
+
+             # update datum count
+             self._datum_count += 1
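Loader is the ingestion half of the same workflow: it inherits cache construction from Builder and adds per-datum intermediate computation. A minimal sketch, assuming a pre-built list[Segmentation] (the Segmentation constructor lives in annotation.py, which is not shown in this diff):

from valor_lite.semantic_segmentation.loader import Loader

# `segmentations` is an assumed list[Segmentation] built from boolean masks;
# see valor_lite/semantic_segmentation/annotation.py for the constructor
loader = Loader.persistent(path="./seg_cache")  # or Loader.in_memory()
loader.add_data(segmentations, show_progress=True)

evaluator = loader.finalize()
metrics = evaluator.compute_precision_recall_iou()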