PyPI - dataeval - Versions diffs - 1.0.0rc2__tar.gz → 1.0.0rc3__tar.gz - Mend

dataeval 1.0.0rc2__tar.gz → 1.0.0rc3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (99) hide show

{dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataeval
-Version: 1.0.0rc2
+Version: 1.0.0rc3
 Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
 Project-URL: Homepage, https://dataeval.ai/
 Project-URL: Repository, https://github.com/aria-ml/dataeval/

{dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/_metadata.py RENAMED Viewed

@@ -1250,6 +1250,7 @@ class Metadata(Array, FeatureExtractor):
         scores = []
         srcidx = []
         datum_count = len(self._dataset)
+        _logger.info("Processing metadata for %d dataset items", datum_count)
         self._has_targets = self._process_targets(raw, labels, bboxes, scores, srcidx, datum_count, progress_callback)
@@ -1259,6 +1260,21 @@ class Metadata(Array, FeatureExtractor):
         bboxes = np_asarray(bboxes, dtype=np.float32) if self._has_targets else None
         srcidx = np.asarray(srcidx, dtype=np.intp)
+        n_classes = len(np.unique(labels)) if len(labels) else 0
+        if self._has_targets:
+            _logger.info(
+                "Object Detection dataset: %d images, %d classes, %d detections",
+                datum_count,
+                n_classes,
+                len(labels),
+            )
+        else:
+            _logger.info(
+                "Image Classification dataset: %d images, %d classes",
+                datum_count,
+                n_classes,
+            )
         index2label = self._dataset.metadata.get("index2label", {i: str(i) for i in np.unique(labels)})
         target_idx = self._compute_target_indices(srcidx, datum_count, bool(self._has_targets))
         reserved = ["image_index", "target_index", "class_label", "score", "box"]
@@ -1314,6 +1330,12 @@ class Metadata(Array, FeatureExtractor):
         self._dataframe = pl.DataFrame(combined_rows)
         self._is_structured = True
+        _logger.debug(
+            "Metadata structured: %d image factors, %d target factors, %d dropped",
+            len(self._image_factors),
+            len(self._target_factors),
+            sum(len(v) for v in self._dropped_factors.values()),
+        )
         # Build _factors dict from stored factor dictionaries
         self._build_factors()

{dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/_version.py RENAMED Viewed

@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
-__version__ = version = '1.0.0rc2'
-__version_tuple__ = version_tuple = (1, 0, 0, 'rc2')
+__version__ = version = '1.0.0rc3'
+__version_tuple__ = version_tuple = (1, 0, 0, 'rc3')
 __commit_id__ = commit_id = None

{dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_calculate.py RENAMED Viewed

@@ -167,27 +167,32 @@ def _collect_calculator_stats(
     datum: NDArray[Any],
     box: BoundingBox | None,
     per_channel: bool,
-) -> tuple[list[dict[str, list[Any]]], dict[str, Any]]:
+) -> tuple[list[dict[str, list[Any]]], dict[str, Any], list[str]]:
     """
     Collect stats from all calculators.
     Returns
     -------
-    tuple[list[dict[str, list[Any]]], dict[str, Any]]
-        A tuple of (stats_list, empty_values_map) where:
+    tuple[list[dict[str, list[Any]]], dict[str, Any], list[str]]
+        A tuple of (stats_list, empty_values_map, warnings) where:
         - stats_list: List of computed stats from each calculator
         - empty_values_map: Mapping of stat names to their empty values (defaults to np.nan)
+        - warnings: List of warning messages from calculators
     """
     stats_list = []
     empty_values_map: dict[str, Any] = {}
+    warnings: list[str] = []
     processor = CalculatorCache(datum, box, per_channel)
     for calculator_cls, flags in calculators:
         calculator = calculator_cls(datum, processor, per_channel)
         stats_list.append(calculator.compute(flags))
         # Collect empty values from this calculator
         empty_values_map.update(calculator.get_empty_values())
+        # Collect warnings from this calculator
+        if hasattr(calculator, "warnings"):
+            warnings.extend(calculator.warnings)
         del calculator
-    return stats_list, empty_values_map
+    return stats_list, empty_values_map, warnings
 def _determine_channel_indices(calculator_output: list[dict[str, list[Any]]], num_channels: int) -> list[int | None]:
@@ -303,10 +308,18 @@ def _calculate_datum(
             box_count += 1
             if not box.is_clippable():
                 invalid_box_count += 1
-                warnings_list.append(f"Bounding box [{i}][{i_b}]: {box} for datum shape {datum.shape} is invalid.")
+                source = f"{i}[{i_b}]"
+                warnings_list.append(f"{source}: Bounding box {box} for datum shape {datum.shape} is invalid")
         # Collect stats from all calculators
-        calculator_stats, empty_values_map = _collect_calculator_stats(calculators, datum, box, per_channel)
+        calculator_stats, empty_values_map, calc_warnings = _collect_calculator_stats(
+            calculators, datum, box, per_channel
+        )
+        # Thread calculator warnings with index context
+        for w in calc_warnings:
+            source = f"{i}" if box is None else f"{i}[{i_b}]"
+            warnings_list.append(f"{source}: {w}")
         # Determine what channel indices are needed
         sorted_channels = _determine_channel_indices(calculator_stats, num_channels)
@@ -505,8 +518,14 @@ def calculate(
     # Get calculators from registry based on flags
     calculators = CalculatorRegistry.get_calculators(stats)
+    # Log the individual flags that will be computed
+    resolved_names = [
+        f.name for f in type(stats) if f in stats and f.name and f.value and (f.value & (f.value - 1)) == 0
+    ]
     _logger.info(
-        "Starting calculate with per_image=%s, per_target=%s, per_channel=%s",
+        "Starting calculate: %d stats [%s], per_image=%s, per_target=%s, per_channel=%s",
+        len(resolved_names),
+        ", ".join(resolved_names),
         per_image,
         per_target,
         per_channel,
@@ -543,8 +562,13 @@ def calculate(
             if progress_callback:
                 progress_callback(image_count, total=total_images)
+    # Aggregate warnings by message type, collecting indices per type
+    grouped_warnings: dict[str, list[str]] = {}
     for w in warning_list:
-        _logger.warning(w)
+        idx, _, msg = w.partition(": ")
+        grouped_warnings.setdefault(msg, []).append(idx)
+    for msg, indices in grouped_warnings.items():
+        _logger.warning("%s — indices: %s", msg, ", ".join(indices))
     _logger.debug("Sorting %d source indices and %d stats", len(source_indices), len(aggregated_stats))
     sorted_source_indices, sorted_aggregated_stats = _sort(source_indices, aggregated_stats)

{dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_calculators/_hashstats.py RENAMED Viewed

@@ -28,35 +28,42 @@ class HashStatCalculator(Calculator):
     def __init__(self, datum: NDArray[Any], cache: "CalculatorCache", per_channel: bool = False) -> None:  # noqa: ARG002
         self.datum = datum
         self.cache = cache
+        self.warnings: list[str] = []
     def get_applicable_flags(self) -> ImageStats:
         """Return which flags this calculator handles."""
         return ImageStats.HASH
-    def _xxhash(self) -> list[str]:
-        from dataeval.core._hash import xxhash
+    def _collect(self, result: tuple[str, str | None]) -> list[str]:
+        hash_value, warning = result
+        if warning:
+            self.warnings.append(warning)
+        return [hash_value]
-        return [xxhash(self.cache.image)]
+    def _compute_xxhash(self) -> list[str]:
+        from dataeval.core._hash import _xxhash
-    def _phash(self) -> list[str]:
-        from dataeval.core._hash import phash
+        return self._collect(_xxhash(self.cache.image))
-        return [phash(self.cache.image)]
+    def _compute_phash(self) -> list[str]:
+        from dataeval.core._hash import _phash
-    def _phash_d4(self) -> list[str]:
-        from dataeval.core._hash import phash_d4
+        return self._collect(_phash(self.cache.image))
-        return [phash_d4(self.cache.image)]
+    def _compute_phash_d4(self) -> list[str]:
+        from dataeval.core._hash import _phash_d4
-    def _dhash(self) -> list[str]:
-        from dataeval.core._hash import dhash
+        return self._collect(_phash_d4(self.cache.image))
-        return [dhash(self.cache.image)]
+    def _compute_dhash(self) -> list[str]:
+        from dataeval.core._hash import _dhash
-    def _dhash_d4(self) -> list[str]:
-        from dataeval.core._hash import dhash_d4
+        return self._collect(_dhash(self.cache.image))
-        return [dhash_d4(self.cache.image)]
+    def _compute_dhash_d4(self) -> list[str]:
+        from dataeval.core._hash import _dhash_d4
+        return self._collect(_dhash_d4(self.cache.image))
     def get_empty_values(self) -> dict[str, Any]:
         """Return empty values for hash statistics."""
@@ -71,9 +78,9 @@ class HashStatCalculator(Calculator):
     def get_handlers(self) -> dict[ImageStats, tuple[str, Callable[[], list[Any]]]]:
         """Return mapping of flags to (stat_name, handler_function)."""
         return {
-            ImageStats.HASH_XXHASH: ("xxhash", self._xxhash),
-            ImageStats.HASH_PHASH: ("phash", self._phash),
-            ImageStats.HASH_DHASH: ("dhash", self._dhash),
-            ImageStats.HASH_PHASH_D4: ("phash_d4", self._phash_d4),
-            ImageStats.HASH_DHASH_D4: ("dhash_d4", self._dhash_d4),
+            ImageStats.HASH_XXHASH: ("xxhash", self._compute_xxhash),
+            ImageStats.HASH_PHASH: ("phash", self._compute_phash),
+            ImageStats.HASH_DHASH: ("dhash", self._compute_dhash),
+            ImageStats.HASH_PHASH_D4: ("phash_d4", self._compute_phash_d4),
+            ImageStats.HASH_DHASH_D4: ("dhash_d4", self._compute_dhash_d4),
         }

{dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_hash.py RENAMED Viewed

@@ -20,7 +20,7 @@ HASH_SIZE = 8
 MAX_FACTOR = 4
-def _prepare_image(image: Array3D[Any], min_size: int = HASH_SIZE + 1) -> NDArray[np.uint8] | None:
+def _prepare_image(image: Array3D[Any], min_size: int = HASH_SIZE + 1) -> tuple[NDArray[np.uint8] | None, str | None]:
     """
     Prepare an image for perceptual hashing by normalizing and converting to grayscale.
@@ -33,65 +33,36 @@ def _prepare_image(image: Array3D[Any], min_size: int = HASH_SIZE + 1) -> NDArra
     Returns
     -------
-    NDArray[np.uint8] | None
-        Grayscale image ready for hashing, or None if image is unsuitable.
+    tuple[NDArray[np.uint8] | None, str | None]
+        A tuple of (grayscale_image, warning). On success, warning is None.
+        On failure, grayscale_image is None and warning describes the reason.
     """
     image_np = as_numpy(image)
     # Perceptual hashing only works on spatial data (2D or higher)
     if image_np.ndim < 2:
-        _logger.warning("Perceptual hashing requires spatial data (2D or higher dimensions)")
-        return None
+        return None, "Perceptual hashing requires spatial data (2D or higher dimensions)"
     # Verify that the image is at least larger than minimum size
     min_dim = min(image_np.shape[-2:])
     if min_dim < min_size:
-        _logger.warning("Image too small for perceptual hashing: min_dim=%d", min_dim)
-        return None
+        return None, "Image too small for perceptual hashing"
     # Normalize the image shape to CxHxW
     normalized = normalize_image_shape(image_np)
     # Convert to single-channel grayscale image
-    return to_canonical_grayscale(normalized)
+    return to_canonical_grayscale(normalized), None
-def phash(image: Array3D[Any]) -> str:
-    """
-    Compute perceptual hash using Discrete Cosine Transform (DCT).
-    Resizes image to a square NxN using Lanczos algorithm where N is 32x32
-    or the largest multiple of 8 smaller than input dimensions. The resampled
-    image is compressed using DCT and the lowest frequency component is encoded
-    as a bit array of greater or less than median value.
-    Parameters
-    ----------
-    image : Array3D
-        An image in CxHxW format. Can be a 3D list, or array-like object.
-    Returns
-    -------
-    str
-        Hex string hash of the image, or empty string if image is too small
-        or not spatial data.
-    Notes
-    -----
-    DCT-based hashing (pHash) is robust to:
-    - Scaling and resizing
-    - Minor color adjustments
-    - Compression artifacts
-    It captures frequency information, making it effective for detecting
-    images that have been resized or slightly modified.
-    """
+def _phash(image: Array3D[Any]) -> tuple[str, str | None]:
+    """Compute perceptual hash, returning (hash, warning) tuple."""
     image_np = as_numpy(image)
     _logger.debug("Computing perceptual hash for image with shape: %s", image_np.shape)
-    grayscale = _prepare_image(image_np)
+    grayscale, warning = _prepare_image(image_np)
     if grayscale is None:
-        return ""
+        return "", warning
     # Calculates the dimensions of the resized square image
     min_dim = min(image_np.shape[-2:])
@@ -112,43 +83,17 @@ def phash(image: Array3D[Any]) -> str:
     hash_hex = np.packbits(padded).tobytes().hex()
     result = hash_hex if hash_hex else "0"
     _logger.debug("Perceptual hash computed: %s", result[:16] + "..." if len(result) > 16 else result)
-    return result
+    return result, None
-def dhash(image: Array3D[Any]) -> str:
-    """
-    Compute difference hash (dHash) for an image.
-    Resizes then crops image to 9x8 grayscale and computes horizontal gradient
-    by comparing adjacent pixels, producing a 64-bit hash. Captures relative
-    brightness changes rather than absolute values.
-    Parameters
-    ----------
-    image : Array3D
-        An image in CxHxW format. Can be a 3D list, or array-like object.
-    Returns
-    -------
-    str
-        Hex string hash of the image, or empty string if image is too small
-        or not spatial data.
-    Notes
-    -----
-    Difference hash captures gradient information:
-    - Captures structural information via pixel transitions
-    - Complementary to DCT-based pHash (frequency vs gradient domain)
-    The horizontal gradient approach makes it particularly effective for
-    detecting cropped or slightly shifted versions of images.
-    """
+def _dhash(image: Array3D[Any]) -> tuple[str, str | None]:
+    """Compute difference hash, returning (hash, warning) tuple."""
     image_np = as_numpy(image)
     _logger.debug("Computing difference hash for image with shape: %s", image_np.shape)
-    grayscale = _prepare_image(image_np)
+    grayscale, warning = _prepare_image(image_np)
     if grayscale is None:
-        return ""
+        return "", warning
     # Resize to 9x8 (9 wide to get 8 differences)
     im = resize(grayscale, HASH_SIZE + 1)
@@ -162,7 +107,7 @@ def dhash(image: Array3D[Any]) -> str:
     hash_hex = np.packbits(diff.flatten()).tobytes().hex()
     result = hash_hex if hash_hex else "0"
     _logger.debug("Difference hash computed: %s", result)
-    return result
+    return result, None
 def _get_d4_transforms(image: NDArray[np.uint8]) -> list[NDArray[np.uint8]]:
@@ -191,43 +136,8 @@ def _get_d4_transforms(image: NDArray[np.uint8]) -> list[NDArray[np.uint8]]:
     return transforms
-def phash_d4(image: Array3D[Any]) -> str:
-    """
-    Compute orientation-invariant perceptual hash using DCT.
-    Computes phash for all 8 dihedral group transformations (4 rotations ×
-    2 flip states) and returns the lexicographically smallest hash as the
-    canonical representative.
-    Parameters
-    ----------
-    image : Array3D
-        An image in CxHxW format. Can be a 3D list, or array-like object.
-    Returns
-    -------
-    str
-        Canonical hex string hash invariant to rotation and mirroring,
-        or empty string if image is too small or not spatial data.
-    Notes
-    -----
-    This hash is invariant to:
-    - 90°, 180°, 270° rotations
-    - Horizontal and vertical flips
-    - Any combination of rotation and flip
-    The canonical hash is the lexicographically smallest hash among all
-    8 orientations, ensuring that any orientation of the same image
-    produces the identical hash.
-    Computation cost is ~8x that of regular phash.
-    See Also
-    --------
-    phash : Standard orientation-sensitive perceptual hash
-    dhash_d4 : Orientation-invariant difference hash
-    """
+def _phash_d4(image: Array3D[Any]) -> tuple[str, str | None]:
+    """Compute orientation-invariant perceptual hash, returning (hash, warning) tuple."""
     from scipy.fftpack import dct
     from dataeval.utils.preprocessing import normalize_image_shape, resize, to_canonical_grayscale
@@ -239,11 +149,11 @@ def phash_d4(image: Array3D[Any]) -> str:
     # Validate input
     if image_np.ndim < 2:
-        return ""
+        return "", "Perceptual hashing requires spatial data (2D or higher dimensions)"
     min_dim = min(image_np.shape[-2:])
     if min_dim < hash_size + 1:
-        return ""
+        return "", "Image too small for perceptual hashing"
     # Prepare grayscale image
     normalized = normalize_image_shape(image_np)
@@ -269,14 +179,133 @@ def phash_d4(image: Array3D[Any]) -> str:
         hashes.append(hash_hex if hash_hex else "0")
     # Return canonical (lexicographically smallest) hash
-    return min(hashes)
+    return min(hashes), None
-def dhash_d4(image: Array3D[Any]) -> str:
+def _dhash_d4(image: Array3D[Any]) -> tuple[str, str | None]:
+    """Compute orientation-invariant difference hash, returning (hash, warning) tuple."""
+    from dataeval.utils.preprocessing import normalize_image_shape, resize, to_canonical_grayscale
+    hash_size = 8
+    image_np = as_numpy(image)
+    # Validate input
+    if image_np.ndim < 2:
+        return "", "Perceptual hashing requires spatial data (2D or higher dimensions)"
+    min_dim = min(image_np.shape[-2:])
+    if min_dim < hash_size + 1:
+        return "", "Image too small for perceptual hashing"
+    # Prepare grayscale image
+    normalized = normalize_image_shape(image_np)
+    grayscale = to_canonical_grayscale(normalized)
+    # Compute hash for each D4 transformation
+    hashes: list[str] = []
+    for transformed in _get_d4_transforms(grayscale):
+        # Resize to 9x8 (9 wide to get 8 horizontal differences)
+        im = resize(transformed, hash_size + 1)
+        im = im[:hash_size, : hash_size + 1]
+        # Compute horizontal gradient
+        diff = im[:, :-1] > im[:, 1:]
+        # Convert to hex
+        hash_hex = np.packbits(diff.flatten()).tobytes().hex()
+        hashes.append(hash_hex if hash_hex else "0")
+    # Return canonical (lexicographically smallest) hash
+    return min(hashes), None
+def _xxhash(image: Array3D[Any]) -> tuple[str, str | None]:
+    """Compute xxhash, returning (hash, warning) tuple."""
+    image_np = as_numpy(image)
+    _logger.debug("Computing xxhash for image with shape: %s", image_np.shape)
+    hash_result = xxh.xxh3_64_hexdigest(image_np.ravel().tobytes())
+    _logger.debug("xxhash computed: %s", hash_result)
+    return hash_result, None
+def _log_and_return(result: tuple[str, str | None]) -> str:
+    """Log the warning (if any) and return just the hash string."""
+    hash_value, warning = result
+    if warning:
+        _logger.warning(warning)
+    return hash_value
+def phash(image: Array3D[Any]) -> str:
     """
-    Compute orientation-invariant difference hash using gradients.
+    Compute perceptual hash using Discrete Cosine Transform (DCT).
-    Computes dhash for all 8 dihedral group transformations (4 rotations ×
+    Resizes image to a square NxN using Lanczos algorithm where N is 32x32
+    or the largest multiple of 8 smaller than input dimensions. The resampled
+    image is compressed using DCT and the lowest frequency component is encoded
+    as a bit array of greater or less than median value.
+    Parameters
+    ----------
+    image : Array3D
+        An image in CxHxW format. Can be a 3D list, or array-like object.
+    Returns
+    -------
+    str
+        Hex string hash of the image, or empty string if image is too small
+        or not spatial data.
+    Notes
+    -----
+    DCT-based hashing (pHash) is robust to:
+    - Scaling and resizing
+    - Minor color adjustments
+    - Compression artifacts
+    It captures frequency information, making it effective for detecting
+    images that have been resized or slightly modified.
+    """
+    return _log_and_return(_phash(image))
+def dhash(image: Array3D[Any]) -> str:
+    """
+    Compute difference hash (dHash) for an image.
+    Resizes then crops image to 9x8 grayscale and computes horizontal gradient
+    by comparing adjacent pixels, producing a 64-bit hash. Captures relative
+    brightness changes rather than absolute values.
+    Parameters
+    ----------
+    image : Array3D
+        An image in CxHxW format. Can be a 3D list, or array-like object.
+    Returns
+    -------
+    str
+        Hex string hash of the image, or empty string if image is too small
+        or not spatial data.
+    Notes
+    -----
+    Difference hash captures gradient information:
+    - Captures structural information via pixel transitions
+    - Complementary to DCT-based pHash (frequency vs gradient domain)
+    The horizontal gradient approach makes it particularly effective for
+    detecting cropped or slightly shifted versions of images.
+    """
+    return _log_and_return(_dhash(image))
+def phash_d4(image: Array3D[Any]) -> str:
+    """
+    Compute orientation-invariant perceptual hash using DCT.
+    Computes phash for all 8 dihedral group transformations (4 rotations ×
     2 flip states) and returns the lexicographically smallest hash as the
     canonical representative.
@@ -302,47 +331,54 @@ def dhash_d4(image: Array3D[Any]) -> str:
     8 orientations, ensuring that any orientation of the same image
     produces the identical hash.
-    Computation cost is ~8x that of regular dhash.
+    Computation cost is ~8x that of regular phash.
     See Also
     --------
-    dhash : Standard orientation-sensitive difference hash
-    phash_d4 : Orientation-invariant perceptual hash
+    phash : Standard orientation-sensitive perceptual hash
+    dhash_d4 : Orientation-invariant difference hash
     """
-    from dataeval.utils.preprocessing import normalize_image_shape, resize, to_canonical_grayscale
+    return _log_and_return(_phash_d4(image))
-    hash_size = 8
-    image_np = as_numpy(image)
+def dhash_d4(image: Array3D[Any]) -> str:
+    """
+    Compute orientation-invariant difference hash using gradients.
-    # Validate input
-    if image_np.ndim < 2:
-        return ""
+    Computes dhash for all 8 dihedral group transformations (4 rotations ×
+    2 flip states) and returns the lexicographically smallest hash as the
+    canonical representative.
-    min_dim = min(image_np.shape[-2:])
-    if min_dim < hash_size + 1:
-        return ""
+    Parameters
+    ----------
+    image : Array3D
+        An image in CxHxW format. Can be a 3D list, or array-like object.
-    # Prepare grayscale image
-    normalized = normalize_image_shape(image_np)
-    grayscale = to_canonical_grayscale(normalized)
+    Returns
+    -------
+    str
+        Canonical hex string hash invariant to rotation and mirroring,
+        or empty string if image is too small or not spatial data.
-    # Compute hash for each D4 transformation
-    hashes: list[str] = []
-    for transformed in _get_d4_transforms(grayscale):
-        # Resize to 9x8 (9 wide to get 8 horizontal differences)
-        im = resize(transformed, hash_size + 1)
-        im = im[:hash_size, : hash_size + 1]
+    Notes
+    -----
+    This hash is invariant to:
+    - 90°, 180°, 270° rotations
+    - Horizontal and vertical flips
+    - Any combination of rotation and flip
-        # Compute horizontal gradient
-        diff = im[:, :-1] > im[:, 1:]
+    The canonical hash is the lexicographically smallest hash among all
+    8 orientations, ensuring that any orientation of the same image
+    produces the identical hash.
-        # Convert to hex
-        hash_hex = np.packbits(diff.flatten()).tobytes().hex()
-        hashes.append(hash_hex if hash_hex else "0")
+    Computation cost is ~8x that of regular dhash.
-    # Return canonical (lexicographically smallest) hash
-    return min(hashes)
+    See Also
+    --------
+    dhash : Standard orientation-sensitive difference hash
+    phash_d4 : Orientation-invariant perceptual hash
+    """
+    return _log_and_return(_dhash_d4(image))
 def xxhash(image: Array3D[Any]) -> str:
@@ -368,11 +404,7 @@ def xxhash(image: Array3D[Any]) -> str:
     hashes, it will produce completely different values for images that
     differ by even a single pixel.
     """
-    image_np = as_numpy(image)
-    _logger.debug("Computing xxhash for image with shape: %s", image_np.shape)
-    hash_result = xxh.xxh3_64_hexdigest(image_np.ravel().tobytes())
-    _logger.debug("xxhash computed: %s", hash_result)
-    return hash_result
+    return _log_and_return(_xxhash(image))
 def hamming_distance(hash1: str, hash2: str) -> int:

{dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/quality/_duplicates.py RENAMED Viewed

@@ -2,7 +2,7 @@
 __all__ = []
-from collections.abc import Sequence
+from collections.abc import Iterator, Sequence
 from dataclasses import dataclass
 from typing import Any, Generic, Literal, NamedTuple, TypeVar, overload
@@ -75,6 +75,9 @@ class NearDuplicateGroup(Generic[TIndexType]):
     methods: frozenset[str]
     orientation: Literal["rotated", "same"] | None = None
+    def __iter__(self) -> Iterator[TIndexType]:
+        yield from self.indices
     def __repr__(self) -> str:
         orientation = f", orientation={self.orientation}" if self.orientation else ""
         return f"NearDuplicateGroup({list(self.indices)}, methods={sorted(self.methods)}{orientation})"

dataeval 1.0.0rc2__tar.gz → 1.0.0rc3__tar.gz

dataeval 1.0.0rc2__tar.gz → 1.0.0rc3__tar.gz