PyPI - dataeval - Versions diffs - 0.86.9__tar.gz → 0.87.0__tar.gz - Mend

dataeval 0.86.9__tar.gz → 0.87.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (119) hide show

{dataeval-0.86.9 → dataeval-0.87.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataeval
-Version: 0.86.9
+Version: 0.87.0
 Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
 Project-URL: Homepage, https://dataeval.ai/
 Project-URL: Repository, https://github.com/aria-ml/dataeval/
@@ -8,7 +8,7 @@ Project-URL: Documentation, https://dataeval.readthedocs.io/
 Author-email: Andrew Weng <andrew.weng@ariacoustics.com>, Bill Peria <bill.peria@ariacoustics.com>, Jon Botts <jonathan.botts@ariacoustics.com>, Jonathan Christian <jonathan.christian@ariacoustics.com>, Justin McMillan <justin.mcmillan@ariacoustics.com>, Ryan Wood <ryan.wood@ariacoustics.com>, Scott Swan <scott.swan@ariacoustics.com>, Shaun Jullens <shaun.jullens@ariacoustics.com>
 Maintainer-email: ARiA <dataeval@ariacoustics.com>
 License-Expression: MIT
-License-File: LICENSE.txt
+License-File: LICENSE
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Science/Research
 Classifier: License :: OSI Approved :: MIT License
@@ -20,15 +20,12 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Scientific/Engineering
 Requires-Python: <3.13,>=3.9
-Requires-Dist: defusedxml>=0.7.1
 Requires-Dist: fast-hdbscan==0.2.0
 Requires-Dist: lightgbm>=4
 Requires-Dist: numba>=0.59.1
 Requires-Dist: numpy>=1.24.2
 Requires-Dist: pandas>=2.0
-Requires-Dist: pillow>=10.3.0
 Requires-Dist: polars>=1.0.0
-Requires-Dist: requests>=2.32.3
 Requires-Dist: scikit-learn>=1.5.0
 Requires-Dist: scipy>=1.10
 Requires-Dist: torch>=2.2.0
@@ -123,14 +120,8 @@ micromamba create -f environment\environment.yaml -c pytorch
 ### **Installing from GitHub**
-To install DataEval from source locally on Ubuntu, you will need `git-lfs` to
-download larger, binary source files.
-```bash
-sudo apt-get install git-lfs
-```
-Pull the source down and change to the DataEval project directory.
+To install DataEval from source locally on Ubuntu, pull the source down and
+change to the DataEval project directory.
 ```bash
 git clone https://github.com/aria-ml/dataeval.git
@@ -167,10 +158,7 @@ source .venv/bin/activate
 ## Contact Us
-If you have any questions, feel free to reach out to the people below:
-- **POC**: Scott Swan @scott.swan
-- **DPOC**: Andrew Weng @aweng
+If you have any questions, feel free to reach out to [us](mailto:dataeval@ariacoustics.com)!
 ## Acknowledgement

{dataeval-0.86.9 → dataeval-0.87.0}/README.md RENAMED Viewed

@@ -72,14 +72,8 @@ micromamba create -f environment\environment.yaml -c pytorch
 ### **Installing from GitHub**
-To install DataEval from source locally on Ubuntu, you will need `git-lfs` to
-download larger, binary source files.
-```bash
-sudo apt-get install git-lfs
-```
-Pull the source down and change to the DataEval project directory.
+To install DataEval from source locally on Ubuntu, pull the source down and
+change to the DataEval project directory.
 ```bash
 git clone https://github.com/aria-ml/dataeval.git
@@ -116,10 +110,7 @@ source .venv/bin/activate
 ## Contact Us
-If you have any questions, feel free to reach out to the people below:
-- **POC**: Scott Swan @scott.swan
-- **DPOC**: Andrew Weng @aweng
+If you have any questions, feel free to reach out to [us](mailto:dataeval@ariacoustics.com)!
 ## Acknowledgement

{dataeval-0.86.9 → dataeval-0.87.0}/pyproject.toml RENAMED Viewed

@@ -31,15 +31,12 @@ classifiers = [
   "Topic :: Scientific/Engineering",
 ]
 dependencies = [
-  "defusedxml>=0.7.1",
   "fast_hdbscan==0.2.0",
   "lightgbm>=4",
   "numba>=0.59.1",
   "numpy>=1.24.2",
   "pandas>=2.0",
-  "pillow>=10.3.0",
   "polars>=1.0.0",
-  "requests>=2.32.3",
   "scipy>=1.10",
   "scikit-learn>=1.5.0",
   "torch>=2.2.0",
@@ -96,6 +93,7 @@ docs = [
   "jinja2>=3.1.6",
   "jupyter-client>=8.6.0",
   "jupyter-cache>=1.0",
+  "maite-datasets>=0.0.1",
   "myst-nb>=1.0",
   "sphinx-autoapi>=3.6.0",
   "sphinx-design>=0.6.1",

{dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/__init__.py RENAMED Viewed

@@ -9,7 +9,7 @@ from __future__ import annotations
 try:
     from ._version import __version__
-except ImportError:
+except ImportError:  # pragma: no cover
     __version__ = "unknown"
 # Strongly type for pyright

{dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/_version.py RENAMED Viewed

@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
-__version__ = version = '0.86.9'
-__version_tuple__ = version_tuple = (0, 86, 9)
+__version__ = version = '0.87.0'
+__version_tuple__ = version_tuple = (0, 87, 0)

{dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/config.py RENAMED Viewed

@@ -4,19 +4,15 @@ Global configuration settings for DataEval.
 from __future__ import annotations
-__all__ = ["get_device", "set_device", "get_max_processes", "set_max_processes", "use_max_processes", "DeviceLike"]
+__all__ = ["get_device", "set_device", "get_max_processes", "set_max_processes", "use_max_processes"]
-import sys
-from typing import Any, Union
-if sys.version_info >= (3, 10):
-    from typing import TypeAlias
-else:
-    from typing_extensions import TypeAlias
+from typing import Any
 import numpy as np
 import torch
+from dataeval.typing import DeviceLike
 ### GLOBALS ###
 _device: torch.device | None = None
@@ -27,17 +23,6 @@ _seed: int | None = None
 EPSILON = 1e-12
-### TYPES ###
-DeviceLike: TypeAlias = Union[int, str, tuple[str, int], torch.device]
-"""
-Type alias for types that are acceptable for specifying a torch.device.
-See Also
---------
-`torch.device <https://pytorch.org/docs/stable/tensor_attributes.html#torch.device>`_
-"""
 ### FUNCS ###

{dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/data/_metadata.py RENAMED Viewed

@@ -16,18 +16,31 @@ from dataeval.typing import (
     ObjectDetectionTarget,
 )
 from dataeval.utils._array import as_numpy
-from dataeval.utils._bin import bin_data, digitize_data
+from dataeval.utils._bin import bin_data, digitize_data, is_continuous
 from dataeval.utils.data.metadata import merge
 def _binned(name: str) -> str:
-    return f"{name}[]"
+    return f"{name}↕"
+def _digitized(name: str) -> str:
+    return f"{name}#"
 @dataclass
 class FactorInfo:
-    factor_type: Literal["categorical", "continuous", "discrete"] | None = None
-    discretized_col: str | None = None
+    factor_type: Literal["categorical", "continuous", "discrete"]
+    is_binned: bool = False
+    is_digitized: bool = False
+def _to_col(name: str, info: FactorInfo, binned: bool = True) -> str:
+    if binned and info.is_binned:
+        return _binned(name)
+    if info.is_digitized:
+        return _digitized(name)
+    return name
 class Metadata:
@@ -60,7 +73,7 @@ class Metadata:
         self._class_labels: NDArray[np.intp]
         self._class_names: list[str]
         self._image_indices: NDArray[np.intp]
-        self._factors: dict[str, FactorInfo]
+        self._factors: dict[str, FactorInfo | None]
         self._dropped_factors: dict[str, list[str]]
         self._dataframe: pl.DataFrame
         self._raw: Sequence[Mapping[str, Any]]
@@ -146,14 +159,27 @@ class Metadata:
         return self._dropped_factors
     @property
-    def discretized_data(self) -> NDArray[np.int64]:
-        """Factor data with continuous data discretized."""
+    def digitized_data(self) -> NDArray[np.int64]:
+        """Factor data with digitized categorical data."""
+        if not self.factor_names:
+            return np.array([], dtype=np.int64)
+        self._bin()
+        return (
+            self.dataframe.select([_to_col(k, v, False) for k, v in self.factor_info.items()])
+            .to_numpy()
+            .astype(np.int64)
+        )
+    @property
+    def binned_data(self) -> NDArray[np.int64]:
+        """Factor data with binned continuous data."""
         if not self.factor_names:
             return np.array([], dtype=np.int64)
         self._bin()
         return (
-            self.dataframe.select([info.discretized_col or name for name, info in self.factor_info.items()])
+            self.dataframe.select([_to_col(k, v, True) for k, v in self.factor_info.items()])
             .to_numpy()
             .astype(np.int64)
         )
@@ -168,7 +194,7 @@ class Metadata:
     def factor_info(self) -> Mapping[str, FactorInfo]:
         """Factor types of the metadata."""
         self._bin()
-        return dict(filter(self._filter, self._factors.items()))
+        return dict(filter(self._filter, ((k, v) for k, v in self._factors.items() if v is not None)))
     @property
     def factor_data(self) -> NDArray[Any]:
@@ -194,7 +220,7 @@ class Metadata:
     @property
     def image_indices(self) -> NDArray[np.intp]:
         """Indices of images as a NumPy array."""
-        self._bin()
+        self._structure()
         return self._image_indices
     @property
@@ -212,7 +238,7 @@ class Metadata:
             columns = self._dataframe.columns
             for col in (col for col in cols or columns if _binned(col) in columns):
                 self._dataframe.drop_in_place(_binned(col))
-                self._factors[col] = FactorInfo()
+                self._factors[col] = None
             self._is_binned = False
     def _structure(self) -> None:
@@ -277,7 +303,7 @@ class Metadata:
         self._class_labels = labels
         self._class_names = list(index2label.values())
         self._image_indices = target_dict["image_index"]
-        self._factors = dict.fromkeys(factor_dict, FactorInfo())
+        self._factors = dict.fromkeys(factor_dict, None)
         self._dataframe = pl.DataFrame({**target_dict, **factor_dict})
         self._dropped_factors = merged[1]
         self._is_structured = True
@@ -303,24 +329,25 @@ class Metadata:
             )
         column_set = set(df.columns)
-        for col in (col for col in self.factor_names if _binned(col) not in column_set):
+        for col in (col for col in self.factor_names if not {_binned(col), _digitized(col)} & column_set):
             # Get data as numpy array for processing
             data = df[col].to_numpy()
-            col_dz = _binned(col)
             if col in factor_bins:
                 # User provided binning
                 bins = factor_bins[col]
-                df = df.with_columns(pl.Series(name=col_dz, values=digitize_data(data, bins).astype(np.int64)))
-                factor_info[col] = FactorInfo("continuous", col_dz)
+                col_bn = _binned(col)
+                df = df.with_columns(pl.Series(name=col_bn, values=digitize_data(data, bins).astype(np.int64)))
+                factor_info[col] = FactorInfo("continuous", is_binned=True)
             else:
                 # Check if data is numeric
-                unique, ordinal = np.unique(data, return_inverse=True)
-                if not np.issubdtype(data.dtype, np.number) or unique.size <= max(20, data.size * 0.01):
-                    # Non-numeric data or small number of unique values - convert to categorical
-                    df = df.with_columns(pl.Series(name=col_dz, values=ordinal.astype(np.int64)))
-                    factor_info[col] = FactorInfo("categorical", col_dz)
-                elif data.dtype == float:
-                    # Many unique values - discretize by binning
+                _, ordinal = np.unique(data, return_inverse=True)
+                if not np.issubdtype(data.dtype, np.number):
+                    # Non-numeric data - convert to categorical
+                    col_dg = _digitized(col)
+                    df = df.with_columns(pl.Series(name=col_dg, values=ordinal.astype(np.int64)))
+                    factor_info[col] = FactorInfo("categorical", is_digitized=True)
+                elif is_continuous(data, self.image_indices):
+                    # Continuous values - discretize by binning
                     warnings.warn(
                         f"A user defined binning was not provided for {col}. "
                         f"Using the {self.auto_bin_method} method to discretize the data. "
@@ -330,10 +357,12 @@ class Metadata:
                     )
                     # Create binned version
                     binned_data = bin_data(data, self.auto_bin_method)
-                    df = df.with_columns(pl.Series(name=col_dz, values=binned_data.astype(np.int64)))
-                    factor_info[col] = FactorInfo("continuous", col_dz)
+                    col_bn = _binned(col)
+                    df = df.with_columns(pl.Series(name=col_bn, values=binned_data.astype(np.int64)))
+                    factor_info[col] = FactorInfo("continuous", is_binned=True)
                 else:
-                    factor_info[col] = FactorInfo("discrete", col)
+                    # Non-continuous values - treat as discrete
+                    factor_info[col] = FactorInfo("discrete")
         # Store the results
         self._dataframe = df
@@ -367,7 +396,7 @@ class Metadata:
         for k, v in factors.items():
             data = as_numpy(v)[self.image_indices]
             new_columns.append(pl.Series(name=k, values=data))
-            self._factors[k] = FactorInfo()
+            self._factors[k] = None
         if new_columns:
             self._dataframe = self.dataframe.with_columns(new_columns)

{dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/data/_split.py RENAMED Viewed

@@ -208,7 +208,7 @@ def get_groups(metadata: Metadata, split_on: Sequence[str] | None) -> NDArray[np
     split_set = set(split_on)
     indices = [i for i, name in enumerate(metadata.factor_names) if name in split_set]
-    binned_features = metadata.discretized_data[:, indices]
+    binned_features = metadata.binned_data[:, indices]
     return np.unique(binned_features, axis=0, return_inverse=True)[1]

{dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/data/selections/_classbalance.py RENAMED Viewed

@@ -11,12 +11,13 @@ from dataeval.utils._array import as_numpy
 class ClassBalance(Selection[ImageClassificationDatum]):
     """
-    Balance the dataset by class.
+    Select indices of a dataset that will equalize the occurrences of all classes.
     Note
     ----
-    The total number of instances of each class will be equalized which may result
+    1. The total number of instances of each class will be equalized which may result
     in a lower total number of instances than specified by the selection limit.
+    2. This selection currently only supports classification tasks
     """
     stage = SelectionStage.FILTER
@@ -29,7 +30,7 @@ class ClassBalance(Selection[ImageClassificationDatum]):
                 label = int(np.argmax(as_numpy(target)))
             else:
                 # ObjectDetectionTarget and SegmentationTarget not supported yet
-                raise TypeError("ClassFilter only supports classification targets as an array of confidence scores.")
+                raise TypeError("ClassBalance only supports classification targets as an array of class probabilities.")
             class_indices.setdefault(label, []).append(i)
         per_class_limit = min(min(len(c) for c in class_indices.values()), dataset._size_limit // len(class_indices))

{dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/data/selections/_classfilter.py RENAMED Viewed

@@ -14,12 +14,12 @@ from dataeval.utils._array import as_numpy
 class ClassFilter(Selection[Any]):
     """
-    Filter the dataset by class.
+    Select dataset indices based on class labels, keeping only those present in `classes`.
     Parameters
     ----------
     classes : Sequence[int]
-        The classes to filter by.
+        The sequence of classes to keep.
     filter_detections : bool, default True
         Whether to filter detections from targets for object detection and segmentation datasets.
     """
@@ -41,16 +41,16 @@ class ClassFilter(Selection[Any]):
             if isinstance(target, Array):
                 # Get the label for the image
                 label = int(np.argmax(as_numpy(target)))
-                # Check to see if the label is in the classes to filter for
+                # Check to see if the label is in the classes to keep
                 if label in self.classes:
-                    # Include the image
+                    # Include the image index
                     selection.append(idx)
             elif isinstance(target, (ObjectDetectionTarget, SegmentationTarget)):
                 # Get the set of labels from the target
                 labels = set(target.labels if isinstance(target.labels, Iterable) else [target.labels])
                 # Check to see if any labels are in the classes to filter for
                 if labels.intersection(self.classes):
-                    # Include the image
+                    # Include the image index
                     selection.append(idx)
                     # If we are filtering out other labels and there are other labels, add a subselection filter
                     if self.filter_detections and labels.difference(self.classes):

{dataeval-0.86.9 → dataeval-0.87.0}/src/dataeval/data/selections/_indices.py RENAMED Viewed

@@ -9,12 +9,12 @@ from dataeval.data._selection import Select, Selection, SelectionStage
 class Indices(Selection[Any]):
     """
-    Selects specific indices from the dataset.
+    Selects only the given indices from the dataset.
     Parameters
     ----------
     indices : Sequence[int]
-        The indices to select from the dataset.
+        The specific indices to select.
     """
     stage = SelectionStage.FILTER

dataeval 0.86.9__tar.gz → 0.87.0__tar.gz

dataeval 0.86.9__tar.gz → 0.87.0__tar.gz