dataeval 1.0.0rc2__tar.gz → 1.0.0rc3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/PKG-INFO +1 -1
  2. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/_metadata.py +22 -0
  3. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/_version.py +2 -2
  4. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_calculate.py +32 -8
  5. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_calculators/_hashstats.py +27 -20
  6. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_hash.py +180 -148
  7. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/quality/_duplicates.py +4 -1
  8. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/utils/data.py +299 -26
  9. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/.gitignore +0 -0
  10. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/LICENSE +0 -0
  11. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/README.md +0 -0
  12. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/pyproject.toml +0 -0
  13. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/__init__.py +0 -0
  14. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/_embeddings.py +0 -0
  15. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/_helpers.py +0 -0
  16. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/_log.py +0 -0
  17. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/_warm_cache.py +0 -0
  18. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/bias/__init__.py +0 -0
  19. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/bias/_balance.py +0 -0
  20. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/bias/_diversity.py +0 -0
  21. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/bias/_parity.py +0 -0
  22. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/config.py +0 -0
  23. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/__init__.py +0 -0
  24. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_ber.py +0 -0
  25. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_bin.py +0 -0
  26. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_calculate_ratios.py +0 -0
  27. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_calculators/__init__.py +0 -0
  28. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_calculators/_base.py +0 -0
  29. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_calculators/_dimensionstats.py +0 -0
  30. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_calculators/_pixelstats.py +0 -0
  31. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_calculators/_register.py +0 -0
  32. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_calculators/_registry.py +0 -0
  33. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_calculators/_visualstats.py +0 -0
  34. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_clusterer.py +0 -0
  35. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_completeness.py +0 -0
  36. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_coverage.py +0 -0
  37. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_divergence.py +0 -0
  38. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_diversity.py +0 -0
  39. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_fast_hdbscan/_cluster_trees.py +0 -0
  40. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_fast_hdbscan/_disjoint_set.py +0 -0
  41. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_fast_hdbscan/_mst.py +0 -0
  42. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_feature_distance.py +0 -0
  43. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_label_errors.py +0 -0
  44. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_label_parity.py +0 -0
  45. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_label_stats.py +0 -0
  46. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_metadata_insights.py +0 -0
  47. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_mst.py +0 -0
  48. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_mutual_info.py +0 -0
  49. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_nullmodel.py +0 -0
  50. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_parity.py +0 -0
  51. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_rank.py +0 -0
  52. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_uap.py +0 -0
  53. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/extractors/__init__.py +0 -0
  54. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/extractors/_bovw.py +0 -0
  55. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/extractors/_flatten.py +0 -0
  56. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/extractors/_onnx.py +0 -0
  57. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/extractors/_torch.py +0 -0
  58. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/extractors/_uncertainty.py +0 -0
  59. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/flags.py +0 -0
  60. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/performance/__init__.py +0 -0
  61. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/performance/_aggregator.py +0 -0
  62. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/performance/_output.py +0 -0
  63. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/performance/_sufficiency.py +0 -0
  64. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/performance/schedules.py +0 -0
  65. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/protocols.py +0 -0
  66. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/py.typed +0 -0
  67. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/quality/__init__.py +0 -0
  68. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/quality/_outliers.py +0 -0
  69. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/quality/_prioritize.py +0 -0
  70. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/quality/_results.py +0 -0
  71. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/selection/__init__.py +0 -0
  72. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/selection/_classbalance.py +0 -0
  73. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/selection/_classfilter.py +0 -0
  74. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/selection/_indices.py +0 -0
  75. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/selection/_limit.py +0 -0
  76. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/selection/_reverse.py +0 -0
  77. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/selection/_select.py +0 -0
  78. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/selection/_shuffle.py +0 -0
  79. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/shift/__init__.py +0 -0
  80. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/shift/_drift/__init__.py +0 -0
  81. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/shift/_drift/_base.py +0 -0
  82. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/shift/_drift/_chunk.py +0 -0
  83. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/shift/_drift/_mmd.py +0 -0
  84. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/shift/_drift/_mvdc.py +0 -0
  85. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/shift/_drift/_thresholds.py +0 -0
  86. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/shift/_drift/_univariate.py +0 -0
  87. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/shift/_ood/__init__.py +0 -0
  88. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/shift/_ood/_base.py +0 -0
  89. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/shift/_ood/_kneighbors.py +0 -0
  90. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/shift/_ood/_reconstruction.py +0 -0
  91. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/shift/update_strategies.py +0 -0
  92. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/types.py +0 -0
  93. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/utils/__init__.py +0 -0
  94. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/utils/arrays.py +0 -0
  95. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/utils/losses.py +0 -0
  96. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/utils/models.py +0 -0
  97. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/utils/onnx.py +0 -0
  98. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/utils/preprocessing.py +0 -0
  99. {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/utils/training.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataeval
3
- Version: 1.0.0rc2
3
+ Version: 1.0.0rc3
4
4
  Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
5
5
  Project-URL: Homepage, https://dataeval.ai/
6
6
  Project-URL: Repository, https://github.com/aria-ml/dataeval/
@@ -1250,6 +1250,7 @@ class Metadata(Array, FeatureExtractor):
1250
1250
  scores = []
1251
1251
  srcidx = []
1252
1252
  datum_count = len(self._dataset)
1253
+ _logger.info("Processing metadata for %d dataset items", datum_count)
1253
1254
 
1254
1255
  self._has_targets = self._process_targets(raw, labels, bboxes, scores, srcidx, datum_count, progress_callback)
1255
1256
 
@@ -1259,6 +1260,21 @@ class Metadata(Array, FeatureExtractor):
1259
1260
  bboxes = np_asarray(bboxes, dtype=np.float32) if self._has_targets else None
1260
1261
  srcidx = np.asarray(srcidx, dtype=np.intp)
1261
1262
 
1263
+ n_classes = len(np.unique(labels)) if len(labels) else 0
1264
+ if self._has_targets:
1265
+ _logger.info(
1266
+ "Object Detection dataset: %d images, %d classes, %d detections",
1267
+ datum_count,
1268
+ n_classes,
1269
+ len(labels),
1270
+ )
1271
+ else:
1272
+ _logger.info(
1273
+ "Image Classification dataset: %d images, %d classes",
1274
+ datum_count,
1275
+ n_classes,
1276
+ )
1277
+
1262
1278
  index2label = self._dataset.metadata.get("index2label", {i: str(i) for i in np.unique(labels)})
1263
1279
  target_idx = self._compute_target_indices(srcidx, datum_count, bool(self._has_targets))
1264
1280
  reserved = ["image_index", "target_index", "class_label", "score", "box"]
@@ -1314,6 +1330,12 @@ class Metadata(Array, FeatureExtractor):
1314
1330
 
1315
1331
  self._dataframe = pl.DataFrame(combined_rows)
1316
1332
  self._is_structured = True
1333
+ _logger.debug(
1334
+ "Metadata structured: %d image factors, %d target factors, %d dropped",
1335
+ len(self._image_factors),
1336
+ len(self._target_factors),
1337
+ sum(len(v) for v in self._dropped_factors.values()),
1338
+ )
1317
1339
 
1318
1340
  # Build _factors dict from stored factor dictionaries
1319
1341
  self._build_factors()
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '1.0.0rc2'
32
- __version_tuple__ = version_tuple = (1, 0, 0, 'rc2')
31
+ __version__ = version = '1.0.0rc3'
32
+ __version_tuple__ = version_tuple = (1, 0, 0, 'rc3')
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -167,27 +167,32 @@ def _collect_calculator_stats(
167
167
  datum: NDArray[Any],
168
168
  box: BoundingBox | None,
169
169
  per_channel: bool,
170
- ) -> tuple[list[dict[str, list[Any]]], dict[str, Any]]:
170
+ ) -> tuple[list[dict[str, list[Any]]], dict[str, Any], list[str]]:
171
171
  """
172
172
  Collect stats from all calculators.
173
173
 
174
174
  Returns
175
175
  -------
176
- tuple[list[dict[str, list[Any]]], dict[str, Any]]
177
- A tuple of (stats_list, empty_values_map) where:
176
+ tuple[list[dict[str, list[Any]]], dict[str, Any], list[str]]
177
+ A tuple of (stats_list, empty_values_map, warnings) where:
178
178
  - stats_list: List of computed stats from each calculator
179
179
  - empty_values_map: Mapping of stat names to their empty values (defaults to np.nan)
180
+ - warnings: List of warning messages from calculators
180
181
  """
181
182
  stats_list = []
182
183
  empty_values_map: dict[str, Any] = {}
184
+ warnings: list[str] = []
183
185
  processor = CalculatorCache(datum, box, per_channel)
184
186
  for calculator_cls, flags in calculators:
185
187
  calculator = calculator_cls(datum, processor, per_channel)
186
188
  stats_list.append(calculator.compute(flags))
187
189
  # Collect empty values from this calculator
188
190
  empty_values_map.update(calculator.get_empty_values())
191
+ # Collect warnings from this calculator
192
+ if hasattr(calculator, "warnings"):
193
+ warnings.extend(calculator.warnings)
189
194
  del calculator
190
- return stats_list, empty_values_map
195
+ return stats_list, empty_values_map, warnings
191
196
 
192
197
 
193
198
  def _determine_channel_indices(calculator_output: list[dict[str, list[Any]]], num_channels: int) -> list[int | None]:
@@ -303,10 +308,18 @@ def _calculate_datum(
303
308
  box_count += 1
304
309
  if not box.is_clippable():
305
310
  invalid_box_count += 1
306
- warnings_list.append(f"Bounding box [{i}][{i_b}]: {box} for datum shape {datum.shape} is invalid.")
311
+ source = f"{i}[{i_b}]"
312
+ warnings_list.append(f"{source}: Bounding box {box} for datum shape {datum.shape} is invalid")
307
313
 
308
314
  # Collect stats from all calculators
309
- calculator_stats, empty_values_map = _collect_calculator_stats(calculators, datum, box, per_channel)
315
+ calculator_stats, empty_values_map, calc_warnings = _collect_calculator_stats(
316
+ calculators, datum, box, per_channel
317
+ )
318
+
319
+ # Thread calculator warnings with index context
320
+ for w in calc_warnings:
321
+ source = f"{i}" if box is None else f"{i}[{i_b}]"
322
+ warnings_list.append(f"{source}: {w}")
310
323
 
311
324
  # Determine what channel indices are needed
312
325
  sorted_channels = _determine_channel_indices(calculator_stats, num_channels)
@@ -505,8 +518,14 @@ def calculate(
505
518
  # Get calculators from registry based on flags
506
519
  calculators = CalculatorRegistry.get_calculators(stats)
507
520
 
521
+ # Log the individual flags that will be computed
522
+ resolved_names = [
523
+ f.name for f in type(stats) if f in stats and f.name and f.value and (f.value & (f.value - 1)) == 0
524
+ ]
508
525
  _logger.info(
509
- "Starting calculate with per_image=%s, per_target=%s, per_channel=%s",
526
+ "Starting calculate: %d stats [%s], per_image=%s, per_target=%s, per_channel=%s",
527
+ len(resolved_names),
528
+ ", ".join(resolved_names),
510
529
  per_image,
511
530
  per_target,
512
531
  per_channel,
@@ -543,8 +562,13 @@ def calculate(
543
562
  if progress_callback:
544
563
  progress_callback(image_count, total=total_images)
545
564
 
565
+ # Aggregate warnings by message type, collecting indices per type
566
+ grouped_warnings: dict[str, list[str]] = {}
546
567
  for w in warning_list:
547
- _logger.warning(w)
568
+ idx, _, msg = w.partition(": ")
569
+ grouped_warnings.setdefault(msg, []).append(idx)
570
+ for msg, indices in grouped_warnings.items():
571
+ _logger.warning("%s — indices: %s", msg, ", ".join(indices))
548
572
 
549
573
  _logger.debug("Sorting %d source indices and %d stats", len(source_indices), len(aggregated_stats))
550
574
  sorted_source_indices, sorted_aggregated_stats = _sort(source_indices, aggregated_stats)
@@ -28,35 +28,42 @@ class HashStatCalculator(Calculator):
28
28
  def __init__(self, datum: NDArray[Any], cache: "CalculatorCache", per_channel: bool = False) -> None: # noqa: ARG002
29
29
  self.datum = datum
30
30
  self.cache = cache
31
+ self.warnings: list[str] = []
31
32
 
32
33
  def get_applicable_flags(self) -> ImageStats:
33
34
  """Return which flags this calculator handles."""
34
35
  return ImageStats.HASH
35
36
 
36
- def _xxhash(self) -> list[str]:
37
- from dataeval.core._hash import xxhash
37
+ def _collect(self, result: tuple[str, str | None]) -> list[str]:
38
+ hash_value, warning = result
39
+ if warning:
40
+ self.warnings.append(warning)
41
+ return [hash_value]
38
42
 
39
- return [xxhash(self.cache.image)]
43
+ def _compute_xxhash(self) -> list[str]:
44
+ from dataeval.core._hash import _xxhash
40
45
 
41
- def _phash(self) -> list[str]:
42
- from dataeval.core._hash import phash
46
+ return self._collect(_xxhash(self.cache.image))
43
47
 
44
- return [phash(self.cache.image)]
48
+ def _compute_phash(self) -> list[str]:
49
+ from dataeval.core._hash import _phash
45
50
 
46
- def _phash_d4(self) -> list[str]:
47
- from dataeval.core._hash import phash_d4
51
+ return self._collect(_phash(self.cache.image))
48
52
 
49
- return [phash_d4(self.cache.image)]
53
+ def _compute_phash_d4(self) -> list[str]:
54
+ from dataeval.core._hash import _phash_d4
50
55
 
51
- def _dhash(self) -> list[str]:
52
- from dataeval.core._hash import dhash
56
+ return self._collect(_phash_d4(self.cache.image))
53
57
 
54
- return [dhash(self.cache.image)]
58
+ def _compute_dhash(self) -> list[str]:
59
+ from dataeval.core._hash import _dhash
55
60
 
56
- def _dhash_d4(self) -> list[str]:
57
- from dataeval.core._hash import dhash_d4
61
+ return self._collect(_dhash(self.cache.image))
58
62
 
59
- return [dhash_d4(self.cache.image)]
63
+ def _compute_dhash_d4(self) -> list[str]:
64
+ from dataeval.core._hash import _dhash_d4
65
+
66
+ return self._collect(_dhash_d4(self.cache.image))
60
67
 
61
68
  def get_empty_values(self) -> dict[str, Any]:
62
69
  """Return empty values for hash statistics."""
@@ -71,9 +78,9 @@ class HashStatCalculator(Calculator):
71
78
  def get_handlers(self) -> dict[ImageStats, tuple[str, Callable[[], list[Any]]]]:
72
79
  """Return mapping of flags to (stat_name, handler_function)."""
73
80
  return {
74
- ImageStats.HASH_XXHASH: ("xxhash", self._xxhash),
75
- ImageStats.HASH_PHASH: ("phash", self._phash),
76
- ImageStats.HASH_DHASH: ("dhash", self._dhash),
77
- ImageStats.HASH_PHASH_D4: ("phash_d4", self._phash_d4),
78
- ImageStats.HASH_DHASH_D4: ("dhash_d4", self._dhash_d4),
81
+ ImageStats.HASH_XXHASH: ("xxhash", self._compute_xxhash),
82
+ ImageStats.HASH_PHASH: ("phash", self._compute_phash),
83
+ ImageStats.HASH_DHASH: ("dhash", self._compute_dhash),
84
+ ImageStats.HASH_PHASH_D4: ("phash_d4", self._compute_phash_d4),
85
+ ImageStats.HASH_DHASH_D4: ("dhash_d4", self._compute_dhash_d4),
79
86
  }
@@ -20,7 +20,7 @@ HASH_SIZE = 8
20
20
  MAX_FACTOR = 4
21
21
 
22
22
 
23
- def _prepare_image(image: Array3D[Any], min_size: int = HASH_SIZE + 1) -> NDArray[np.uint8] | None:
23
+ def _prepare_image(image: Array3D[Any], min_size: int = HASH_SIZE + 1) -> tuple[NDArray[np.uint8] | None, str | None]:
24
24
  """
25
25
  Prepare an image for perceptual hashing by normalizing and converting to grayscale.
26
26
 
@@ -33,65 +33,36 @@ def _prepare_image(image: Array3D[Any], min_size: int = HASH_SIZE + 1) -> NDArra
33
33
 
34
34
  Returns
35
35
  -------
36
- NDArray[np.uint8] | None
37
- Grayscale image ready for hashing, or None if image is unsuitable.
36
+ tuple[NDArray[np.uint8] | None, str | None]
37
+ A tuple of (grayscale_image, warning). On success, warning is None.
38
+ On failure, grayscale_image is None and warning describes the reason.
38
39
  """
39
40
  image_np = as_numpy(image)
40
41
 
41
42
  # Perceptual hashing only works on spatial data (2D or higher)
42
43
  if image_np.ndim < 2:
43
- _logger.warning("Perceptual hashing requires spatial data (2D or higher dimensions)")
44
- return None
44
+ return None, "Perceptual hashing requires spatial data (2D or higher dimensions)"
45
45
 
46
46
  # Verify that the image is at least larger than minimum size
47
47
  min_dim = min(image_np.shape[-2:])
48
48
  if min_dim < min_size:
49
- _logger.warning("Image too small for perceptual hashing: min_dim=%d", min_dim)
50
- return None
49
+ return None, "Image too small for perceptual hashing"
51
50
 
52
51
  # Normalize the image shape to CxHxW
53
52
  normalized = normalize_image_shape(image_np)
54
53
 
55
54
  # Convert to single-channel grayscale image
56
- return to_canonical_grayscale(normalized)
55
+ return to_canonical_grayscale(normalized), None
57
56
 
58
57
 
59
- def phash(image: Array3D[Any]) -> str:
60
- """
61
- Compute perceptual hash using Discrete Cosine Transform (DCT).
62
-
63
- Resizes image to a square NxN using Lanczos algorithm where N is 32x32
64
- or the largest multiple of 8 smaller than input dimensions. The resampled
65
- image is compressed using DCT and the lowest frequency component is encoded
66
- as a bit array of greater or less than median value.
67
-
68
- Parameters
69
- ----------
70
- image : Array3D
71
- An image in CxHxW format. Can be a 3D list, or array-like object.
72
-
73
- Returns
74
- -------
75
- str
76
- Hex string hash of the image, or empty string if image is too small
77
- or not spatial data.
78
-
79
- Notes
80
- -----
81
- DCT-based hashing (pHash) is robust to:
82
- - Scaling and resizing
83
- - Minor color adjustments
84
- - Compression artifacts
85
-
86
- It captures frequency information, making it effective for detecting
87
- images that have been resized or slightly modified.
88
- """
58
+ def _phash(image: Array3D[Any]) -> tuple[str, str | None]:
59
+ """Compute perceptual hash, returning (hash, warning) tuple."""
89
60
  image_np = as_numpy(image)
90
61
  _logger.debug("Computing perceptual hash for image with shape: %s", image_np.shape)
91
62
 
92
- grayscale = _prepare_image(image_np)
63
+ grayscale, warning = _prepare_image(image_np)
93
64
  if grayscale is None:
94
- return ""
65
+ return "", warning
95
66
 
96
67
  # Calculates the dimensions of the resized square image
97
68
  min_dim = min(image_np.shape[-2:])
@@ -112,43 +83,17 @@ def phash(image: Array3D[Any]) -> str:
112
83
  hash_hex = np.packbits(padded).tobytes().hex()
113
84
  result = hash_hex if hash_hex else "0"
114
85
  _logger.debug("Perceptual hash computed: %s", result[:16] + "..." if len(result) > 16 else result)
115
- return result
86
+ return result, None
116
87
 
117
88
 
118
- def dhash(image: Array3D[Any]) -> str:
119
- """
120
- Compute difference hash (dHash) for an image.
121
-
122
- Resizes then crops image to 9x8 grayscale and computes horizontal gradient
123
- by comparing adjacent pixels, producing a 64-bit hash. Captures relative
124
- brightness changes rather than absolute values.
125
-
126
- Parameters
127
- ----------
128
- image : Array3D
129
- An image in CxHxW format. Can be a 3D list, or array-like object.
130
-
131
- Returns
132
- -------
133
- str
134
- Hex string hash of the image, or empty string if image is too small
135
- or not spatial data.
136
-
137
- Notes
138
- -----
139
- Difference hash captures gradient information:
140
- - Captures structural information via pixel transitions
141
- - Complementary to DCT-based pHash (frequency vs gradient domain)
142
-
143
- The horizontal gradient approach makes it particularly effective for
144
- detecting cropped or slightly shifted versions of images.
145
- """
89
+ def _dhash(image: Array3D[Any]) -> tuple[str, str | None]:
90
+ """Compute difference hash, returning (hash, warning) tuple."""
146
91
  image_np = as_numpy(image)
147
92
  _logger.debug("Computing difference hash for image with shape: %s", image_np.shape)
148
93
 
149
- grayscale = _prepare_image(image_np)
94
+ grayscale, warning = _prepare_image(image_np)
150
95
  if grayscale is None:
151
- return ""
96
+ return "", warning
152
97
 
153
98
  # Resize to 9x8 (9 wide to get 8 differences)
154
99
  im = resize(grayscale, HASH_SIZE + 1)
@@ -162,7 +107,7 @@ def dhash(image: Array3D[Any]) -> str:
162
107
  hash_hex = np.packbits(diff.flatten()).tobytes().hex()
163
108
  result = hash_hex if hash_hex else "0"
164
109
  _logger.debug("Difference hash computed: %s", result)
165
- return result
110
+ return result, None
166
111
 
167
112
 
168
113
  def _get_d4_transforms(image: NDArray[np.uint8]) -> list[NDArray[np.uint8]]:
@@ -191,43 +136,8 @@ def _get_d4_transforms(image: NDArray[np.uint8]) -> list[NDArray[np.uint8]]:
191
136
  return transforms
192
137
 
193
138
 
194
- def phash_d4(image: Array3D[Any]) -> str:
195
- """
196
- Compute orientation-invariant perceptual hash using DCT.
197
-
198
- Computes phash for all 8 dihedral group transformations (4 rotations ×
199
- 2 flip states) and returns the lexicographically smallest hash as the
200
- canonical representative.
201
-
202
- Parameters
203
- ----------
204
- image : Array3D
205
- An image in CxHxW format. Can be a 3D list, or array-like object.
206
-
207
- Returns
208
- -------
209
- str
210
- Canonical hex string hash invariant to rotation and mirroring,
211
- or empty string if image is too small or not spatial data.
212
-
213
- Notes
214
- -----
215
- This hash is invariant to:
216
- - 90°, 180°, 270° rotations
217
- - Horizontal and vertical flips
218
- - Any combination of rotation and flip
219
-
220
- The canonical hash is the lexicographically smallest hash among all
221
- 8 orientations, ensuring that any orientation of the same image
222
- produces the identical hash.
223
-
224
- Computation cost is ~8x that of regular phash.
225
-
226
- See Also
227
- --------
228
- phash : Standard orientation-sensitive perceptual hash
229
- dhash_d4 : Orientation-invariant difference hash
230
- """
139
+ def _phash_d4(image: Array3D[Any]) -> tuple[str, str | None]:
140
+ """Compute orientation-invariant perceptual hash, returning (hash, warning) tuple."""
231
141
  from scipy.fftpack import dct
232
142
 
233
143
  from dataeval.utils.preprocessing import normalize_image_shape, resize, to_canonical_grayscale
@@ -239,11 +149,11 @@ def phash_d4(image: Array3D[Any]) -> str:
239
149
 
240
150
  # Validate input
241
151
  if image_np.ndim < 2:
242
- return ""
152
+ return "", "Perceptual hashing requires spatial data (2D or higher dimensions)"
243
153
 
244
154
  min_dim = min(image_np.shape[-2:])
245
155
  if min_dim < hash_size + 1:
246
- return ""
156
+ return "", "Image too small for perceptual hashing"
247
157
 
248
158
  # Prepare grayscale image
249
159
  normalized = normalize_image_shape(image_np)
@@ -269,14 +179,133 @@ def phash_d4(image: Array3D[Any]) -> str:
269
179
  hashes.append(hash_hex if hash_hex else "0")
270
180
 
271
181
  # Return canonical (lexicographically smallest) hash
272
- return min(hashes)
182
+ return min(hashes), None
273
183
 
274
184
 
275
- def dhash_d4(image: Array3D[Any]) -> str:
185
+ def _dhash_d4(image: Array3D[Any]) -> tuple[str, str | None]:
186
+ """Compute orientation-invariant difference hash, returning (hash, warning) tuple."""
187
+ from dataeval.utils.preprocessing import normalize_image_shape, resize, to_canonical_grayscale
188
+
189
+ hash_size = 8
190
+
191
+ image_np = as_numpy(image)
192
+
193
+ # Validate input
194
+ if image_np.ndim < 2:
195
+ return "", "Perceptual hashing requires spatial data (2D or higher dimensions)"
196
+
197
+ min_dim = min(image_np.shape[-2:])
198
+ if min_dim < hash_size + 1:
199
+ return "", "Image too small for perceptual hashing"
200
+
201
+ # Prepare grayscale image
202
+ normalized = normalize_image_shape(image_np)
203
+ grayscale = to_canonical_grayscale(normalized)
204
+
205
+ # Compute hash for each D4 transformation
206
+ hashes: list[str] = []
207
+ for transformed in _get_d4_transforms(grayscale):
208
+ # Resize to 9x8 (9 wide to get 8 horizontal differences)
209
+ im = resize(transformed, hash_size + 1)
210
+ im = im[:hash_size, : hash_size + 1]
211
+
212
+ # Compute horizontal gradient
213
+ diff = im[:, :-1] > im[:, 1:]
214
+
215
+ # Convert to hex
216
+ hash_hex = np.packbits(diff.flatten()).tobytes().hex()
217
+ hashes.append(hash_hex if hash_hex else "0")
218
+
219
+ # Return canonical (lexicographically smallest) hash
220
+ return min(hashes), None
221
+
222
+
223
+ def _xxhash(image: Array3D[Any]) -> tuple[str, str | None]:
224
+ """Compute xxhash, returning (hash, warning) tuple."""
225
+ image_np = as_numpy(image)
226
+ _logger.debug("Computing xxhash for image with shape: %s", image_np.shape)
227
+ hash_result = xxh.xxh3_64_hexdigest(image_np.ravel().tobytes())
228
+ _logger.debug("xxhash computed: %s", hash_result)
229
+ return hash_result, None
230
+
231
+
232
+ def _log_and_return(result: tuple[str, str | None]) -> str:
233
+ """Log the warning (if any) and return just the hash string."""
234
+ hash_value, warning = result
235
+ if warning:
236
+ _logger.warning(warning)
237
+ return hash_value
238
+
239
+
240
+ def phash(image: Array3D[Any]) -> str:
276
241
  """
277
- Compute orientation-invariant difference hash using gradients.
242
+ Compute perceptual hash using Discrete Cosine Transform (DCT).
278
243
 
279
- Computes dhash for all 8 dihedral group transformations (4 rotations ×
244
+ Resizes image to a square NxN using Lanczos algorithm where N is 32x32
245
+ or the largest multiple of 8 smaller than input dimensions. The resampled
246
+ image is compressed using DCT and the lowest frequency component is encoded
247
+ as a bit array of greater or less than median value.
248
+
249
+ Parameters
250
+ ----------
251
+ image : Array3D
252
+ An image in CxHxW format. Can be a 3D list, or array-like object.
253
+
254
+ Returns
255
+ -------
256
+ str
257
+ Hex string hash of the image, or empty string if image is too small
258
+ or not spatial data.
259
+
260
+ Notes
261
+ -----
262
+ DCT-based hashing (pHash) is robust to:
263
+ - Scaling and resizing
264
+ - Minor color adjustments
265
+ - Compression artifacts
266
+
267
+ It captures frequency information, making it effective for detecting
268
+ images that have been resized or slightly modified.
269
+ """
270
+ return _log_and_return(_phash(image))
271
+
272
+
273
+ def dhash(image: Array3D[Any]) -> str:
274
+ """
275
+ Compute difference hash (dHash) for an image.
276
+
277
+ Resizes then crops image to 9x8 grayscale and computes horizontal gradient
278
+ by comparing adjacent pixels, producing a 64-bit hash. Captures relative
279
+ brightness changes rather than absolute values.
280
+
281
+ Parameters
282
+ ----------
283
+ image : Array3D
284
+ An image in CxHxW format. Can be a 3D list, or array-like object.
285
+
286
+ Returns
287
+ -------
288
+ str
289
+ Hex string hash of the image, or empty string if image is too small
290
+ or not spatial data.
291
+
292
+ Notes
293
+ -----
294
+ Difference hash captures gradient information:
295
+ - Captures structural information via pixel transitions
296
+ - Complementary to DCT-based pHash (frequency vs gradient domain)
297
+
298
+ The horizontal gradient approach makes it particularly effective for
299
+ detecting cropped or slightly shifted versions of images.
300
+ """
301
+ return _log_and_return(_dhash(image))
302
+
303
+
304
+ def phash_d4(image: Array3D[Any]) -> str:
305
+ """
306
+ Compute orientation-invariant perceptual hash using DCT.
307
+
308
+ Computes phash for all 8 dihedral group transformations (4 rotations ×
280
309
  2 flip states) and returns the lexicographically smallest hash as the
281
310
  canonical representative.
282
311
 
@@ -302,47 +331,54 @@ def dhash_d4(image: Array3D[Any]) -> str:
302
331
  8 orientations, ensuring that any orientation of the same image
303
332
  produces the identical hash.
304
333
 
305
- Computation cost is ~8x that of regular dhash.
334
+ Computation cost is ~8x that of regular phash.
306
335
 
307
336
  See Also
308
337
  --------
309
- dhash : Standard orientation-sensitive difference hash
310
- phash_d4 : Orientation-invariant perceptual hash
338
+ phash : Standard orientation-sensitive perceptual hash
339
+ dhash_d4 : Orientation-invariant difference hash
311
340
  """
312
- from dataeval.utils.preprocessing import normalize_image_shape, resize, to_canonical_grayscale
341
+ return _log_and_return(_phash_d4(image))
313
342
 
314
- hash_size = 8
315
343
 
316
- image_np = as_numpy(image)
344
+ def dhash_d4(image: Array3D[Any]) -> str:
345
+ """
346
+ Compute orientation-invariant difference hash using gradients.
317
347
 
318
- # Validate input
319
- if image_np.ndim < 2:
320
- return ""
348
+ Computes dhash for all 8 dihedral group transformations (4 rotations ×
349
+ 2 flip states) and returns the lexicographically smallest hash as the
350
+ canonical representative.
321
351
 
322
- min_dim = min(image_np.shape[-2:])
323
- if min_dim < hash_size + 1:
324
- return ""
352
+ Parameters
353
+ ----------
354
+ image : Array3D
355
+ An image in CxHxW format. Can be a 3D list, or array-like object.
325
356
 
326
- # Prepare grayscale image
327
- normalized = normalize_image_shape(image_np)
328
- grayscale = to_canonical_grayscale(normalized)
357
+ Returns
358
+ -------
359
+ str
360
+ Canonical hex string hash invariant to rotation and mirroring,
361
+ or empty string if image is too small or not spatial data.
329
362
 
330
- # Compute hash for each D4 transformation
331
- hashes: list[str] = []
332
- for transformed in _get_d4_transforms(grayscale):
333
- # Resize to 9x8 (9 wide to get 8 horizontal differences)
334
- im = resize(transformed, hash_size + 1)
335
- im = im[:hash_size, : hash_size + 1]
363
+ Notes
364
+ -----
365
+ This hash is invariant to:
366
+ - 90°, 180°, 270° rotations
367
+ - Horizontal and vertical flips
368
+ - Any combination of rotation and flip
336
369
 
337
- # Compute horizontal gradient
338
- diff = im[:, :-1] > im[:, 1:]
370
+ The canonical hash is the lexicographically smallest hash among all
371
+ 8 orientations, ensuring that any orientation of the same image
372
+ produces the identical hash.
339
373
 
340
- # Convert to hex
341
- hash_hex = np.packbits(diff.flatten()).tobytes().hex()
342
- hashes.append(hash_hex if hash_hex else "0")
374
+ Computation cost is ~8x that of regular dhash.
343
375
 
344
- # Return canonical (lexicographically smallest) hash
345
- return min(hashes)
376
+ See Also
377
+ --------
378
+ dhash : Standard orientation-sensitive difference hash
379
+ phash_d4 : Orientation-invariant perceptual hash
380
+ """
381
+ return _log_and_return(_dhash_d4(image))
346
382
 
347
383
 
348
384
  def xxhash(image: Array3D[Any]) -> str:
@@ -368,11 +404,7 @@ def xxhash(image: Array3D[Any]) -> str:
368
404
  hashes, it will produce completely different values for images that
369
405
  differ by even a single pixel.
370
406
  """
371
- image_np = as_numpy(image)
372
- _logger.debug("Computing xxhash for image with shape: %s", image_np.shape)
373
- hash_result = xxh.xxh3_64_hexdigest(image_np.ravel().tobytes())
374
- _logger.debug("xxhash computed: %s", hash_result)
375
- return hash_result
407
+ return _log_and_return(_xxhash(image))
376
408
 
377
409
 
378
410
  def hamming_distance(hash1: str, hash2: str) -> int:
@@ -2,7 +2,7 @@
2
2
 
3
3
  __all__ = []
4
4
 
5
- from collections.abc import Sequence
5
+ from collections.abc import Iterator, Sequence
6
6
  from dataclasses import dataclass
7
7
  from typing import Any, Generic, Literal, NamedTuple, TypeVar, overload
8
8
 
@@ -75,6 +75,9 @@ class NearDuplicateGroup(Generic[TIndexType]):
75
75
  methods: frozenset[str]
76
76
  orientation: Literal["rotated", "same"] | None = None
77
77
 
78
+ def __iter__(self) -> Iterator[TIndexType]:
79
+ yield from self.indices
80
+
78
81
  def __repr__(self) -> str:
79
82
  orientation = f", orientation={self.orientation}" if self.orientation else ""
80
83
  return f"NearDuplicateGroup({list(self.indices)}, methods={sorted(self.methods)}{orientation})"