dataeval 1.0.0rc2__tar.gz → 1.0.0rc3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/PKG-INFO +1 -1
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/_metadata.py +22 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/_version.py +2 -2
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_calculate.py +32 -8
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_calculators/_hashstats.py +27 -20
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_hash.py +180 -148
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/quality/_duplicates.py +4 -1
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/utils/data.py +299 -26
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/.gitignore +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/LICENSE +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/README.md +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/pyproject.toml +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/__init__.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/_embeddings.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/_helpers.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/_log.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/_warm_cache.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/bias/__init__.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/bias/_balance.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/bias/_diversity.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/bias/_parity.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/config.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/__init__.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_ber.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_bin.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_calculate_ratios.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_calculators/__init__.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_calculators/_base.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_calculators/_dimensionstats.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_calculators/_pixelstats.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_calculators/_register.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_calculators/_registry.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_calculators/_visualstats.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_clusterer.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_completeness.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_coverage.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_divergence.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_diversity.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_fast_hdbscan/_cluster_trees.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_fast_hdbscan/_disjoint_set.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_fast_hdbscan/_mst.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_feature_distance.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_label_errors.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_label_parity.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_label_stats.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_metadata_insights.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_mst.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_mutual_info.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_nullmodel.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_parity.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_rank.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/core/_uap.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/extractors/__init__.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/extractors/_bovw.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/extractors/_flatten.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/extractors/_onnx.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/extractors/_torch.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/extractors/_uncertainty.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/flags.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/performance/__init__.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/performance/_aggregator.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/performance/_output.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/performance/_sufficiency.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/performance/schedules.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/protocols.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/py.typed +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/quality/__init__.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/quality/_outliers.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/quality/_prioritize.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/quality/_results.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/selection/__init__.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/selection/_classbalance.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/selection/_classfilter.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/selection/_indices.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/selection/_limit.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/selection/_reverse.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/selection/_select.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/selection/_shuffle.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/shift/__init__.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/shift/_drift/__init__.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/shift/_drift/_base.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/shift/_drift/_chunk.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/shift/_drift/_mmd.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/shift/_drift/_mvdc.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/shift/_drift/_thresholds.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/shift/_drift/_univariate.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/shift/_ood/__init__.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/shift/_ood/_base.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/shift/_ood/_kneighbors.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/shift/_ood/_reconstruction.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/shift/update_strategies.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/types.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/utils/__init__.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/utils/arrays.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/utils/losses.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/utils/models.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/utils/onnx.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/utils/preprocessing.py +0 -0
- {dataeval-1.0.0rc2 → dataeval-1.0.0rc3}/src/dataeval/utils/training.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dataeval
|
|
3
|
-
Version: 1.0.0rc2
|
|
3
|
+
Version: 1.0.0rc3
|
|
4
4
|
Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
|
|
5
5
|
Project-URL: Homepage, https://dataeval.ai/
|
|
6
6
|
Project-URL: Repository, https://github.com/aria-ml/dataeval/
|
|
@@ -1250,6 +1250,7 @@ class Metadata(Array, FeatureExtractor):
|
|
|
1250
1250
|
scores = []
|
|
1251
1251
|
srcidx = []
|
|
1252
1252
|
datum_count = len(self._dataset)
|
|
1253
|
+
_logger.info("Processing metadata for %d dataset items", datum_count)
|
|
1253
1254
|
|
|
1254
1255
|
self._has_targets = self._process_targets(raw, labels, bboxes, scores, srcidx, datum_count, progress_callback)
|
|
1255
1256
|
|
|
@@ -1259,6 +1260,21 @@ class Metadata(Array, FeatureExtractor):
|
|
|
1259
1260
|
bboxes = np_asarray(bboxes, dtype=np.float32) if self._has_targets else None
|
|
1260
1261
|
srcidx = np.asarray(srcidx, dtype=np.intp)
|
|
1261
1262
|
|
|
1263
|
+
n_classes = len(np.unique(labels)) if len(labels) else 0
|
|
1264
|
+
if self._has_targets:
|
|
1265
|
+
_logger.info(
|
|
1266
|
+
"Object Detection dataset: %d images, %d classes, %d detections",
|
|
1267
|
+
datum_count,
|
|
1268
|
+
n_classes,
|
|
1269
|
+
len(labels),
|
|
1270
|
+
)
|
|
1271
|
+
else:
|
|
1272
|
+
_logger.info(
|
|
1273
|
+
"Image Classification dataset: %d images, %d classes",
|
|
1274
|
+
datum_count,
|
|
1275
|
+
n_classes,
|
|
1276
|
+
)
|
|
1277
|
+
|
|
1262
1278
|
index2label = self._dataset.metadata.get("index2label", {i: str(i) for i in np.unique(labels)})
|
|
1263
1279
|
target_idx = self._compute_target_indices(srcidx, datum_count, bool(self._has_targets))
|
|
1264
1280
|
reserved = ["image_index", "target_index", "class_label", "score", "box"]
|
|
@@ -1314,6 +1330,12 @@ class Metadata(Array, FeatureExtractor):
|
|
|
1314
1330
|
|
|
1315
1331
|
self._dataframe = pl.DataFrame(combined_rows)
|
|
1316
1332
|
self._is_structured = True
|
|
1333
|
+
_logger.debug(
|
|
1334
|
+
"Metadata structured: %d image factors, %d target factors, %d dropped",
|
|
1335
|
+
len(self._image_factors),
|
|
1336
|
+
len(self._target_factors),
|
|
1337
|
+
sum(len(v) for v in self._dropped_factors.values()),
|
|
1338
|
+
)
|
|
1317
1339
|
|
|
1318
1340
|
# Build _factors dict from stored factor dictionaries
|
|
1319
1341
|
self._build_factors()
|
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '1.0.0rc2'
|
|
32
|
-
__version_tuple__ = version_tuple = (1, 0, 0, 'rc2')
|
|
31
|
+
__version__ = version = '1.0.0rc3'
|
|
32
|
+
__version_tuple__ = version_tuple = (1, 0, 0, 'rc3')
|
|
33
33
|
|
|
34
34
|
__commit_id__ = commit_id = None
|
|
@@ -167,27 +167,32 @@ def _collect_calculator_stats(
|
|
|
167
167
|
datum: NDArray[Any],
|
|
168
168
|
box: BoundingBox | None,
|
|
169
169
|
per_channel: bool,
|
|
170
|
-
) -> tuple[list[dict[str, list[Any]]], dict[str, Any]]:
|
|
170
|
+
) -> tuple[list[dict[str, list[Any]]], dict[str, Any], list[str]]:
|
|
171
171
|
"""
|
|
172
172
|
Collect stats from all calculators.
|
|
173
173
|
|
|
174
174
|
Returns
|
|
175
175
|
-------
|
|
176
|
-
tuple[list[dict[str, list[Any]]], dict[str, Any]]
|
|
177
|
-
A tuple of (stats_list, empty_values_map) where:
|
|
176
|
+
tuple[list[dict[str, list[Any]]], dict[str, Any], list[str]]
|
|
177
|
+
A tuple of (stats_list, empty_values_map, warnings) where:
|
|
178
178
|
- stats_list: List of computed stats from each calculator
|
|
179
179
|
- empty_values_map: Mapping of stat names to their empty values (defaults to np.nan)
|
|
180
|
+
- warnings: List of warning messages from calculators
|
|
180
181
|
"""
|
|
181
182
|
stats_list = []
|
|
182
183
|
empty_values_map: dict[str, Any] = {}
|
|
184
|
+
warnings: list[str] = []
|
|
183
185
|
processor = CalculatorCache(datum, box, per_channel)
|
|
184
186
|
for calculator_cls, flags in calculators:
|
|
185
187
|
calculator = calculator_cls(datum, processor, per_channel)
|
|
186
188
|
stats_list.append(calculator.compute(flags))
|
|
187
189
|
# Collect empty values from this calculator
|
|
188
190
|
empty_values_map.update(calculator.get_empty_values())
|
|
191
|
+
# Collect warnings from this calculator
|
|
192
|
+
if hasattr(calculator, "warnings"):
|
|
193
|
+
warnings.extend(calculator.warnings)
|
|
189
194
|
del calculator
|
|
190
|
-
return stats_list, empty_values_map
|
|
195
|
+
return stats_list, empty_values_map, warnings
|
|
191
196
|
|
|
192
197
|
|
|
193
198
|
def _determine_channel_indices(calculator_output: list[dict[str, list[Any]]], num_channels: int) -> list[int | None]:
|
|
@@ -303,10 +308,18 @@ def _calculate_datum(
|
|
|
303
308
|
box_count += 1
|
|
304
309
|
if not box.is_clippable():
|
|
305
310
|
invalid_box_count += 1
|
|
306
|
-
|
|
311
|
+
source = f"{i}[{i_b}]"
|
|
312
|
+
warnings_list.append(f"{source}: Bounding box {box} for datum shape {datum.shape} is invalid")
|
|
307
313
|
|
|
308
314
|
# Collect stats from all calculators
|
|
309
|
-
calculator_stats, empty_values_map = _collect_calculator_stats(
|
|
315
|
+
calculator_stats, empty_values_map, calc_warnings = _collect_calculator_stats(
|
|
316
|
+
calculators, datum, box, per_channel
|
|
317
|
+
)
|
|
318
|
+
|
|
319
|
+
# Thread calculator warnings with index context
|
|
320
|
+
for w in calc_warnings:
|
|
321
|
+
source = f"{i}" if box is None else f"{i}[{i_b}]"
|
|
322
|
+
warnings_list.append(f"{source}: {w}")
|
|
310
323
|
|
|
311
324
|
# Determine what channel indices are needed
|
|
312
325
|
sorted_channels = _determine_channel_indices(calculator_stats, num_channels)
|
|
@@ -505,8 +518,14 @@ def calculate(
|
|
|
505
518
|
# Get calculators from registry based on flags
|
|
506
519
|
calculators = CalculatorRegistry.get_calculators(stats)
|
|
507
520
|
|
|
521
|
+
# Log the individual flags that will be computed
|
|
522
|
+
resolved_names = [
|
|
523
|
+
f.name for f in type(stats) if f in stats and f.name and f.value and (f.value & (f.value - 1)) == 0
|
|
524
|
+
]
|
|
508
525
|
_logger.info(
|
|
509
|
-
"Starting calculate
|
|
526
|
+
"Starting calculate: %d stats [%s], per_image=%s, per_target=%s, per_channel=%s",
|
|
527
|
+
len(resolved_names),
|
|
528
|
+
", ".join(resolved_names),
|
|
510
529
|
per_image,
|
|
511
530
|
per_target,
|
|
512
531
|
per_channel,
|
|
@@ -543,8 +562,13 @@ def calculate(
|
|
|
543
562
|
if progress_callback:
|
|
544
563
|
progress_callback(image_count, total=total_images)
|
|
545
564
|
|
|
565
|
+
# Aggregate warnings by message type, collecting indices per type
|
|
566
|
+
grouped_warnings: dict[str, list[str]] = {}
|
|
546
567
|
for w in warning_list:
|
|
547
|
-
|
|
568
|
+
idx, _, msg = w.partition(": ")
|
|
569
|
+
grouped_warnings.setdefault(msg, []).append(idx)
|
|
570
|
+
for msg, indices in grouped_warnings.items():
|
|
571
|
+
_logger.warning("%s — indices: %s", msg, ", ".join(indices))
|
|
548
572
|
|
|
549
573
|
_logger.debug("Sorting %d source indices and %d stats", len(source_indices), len(aggregated_stats))
|
|
550
574
|
sorted_source_indices, sorted_aggregated_stats = _sort(source_indices, aggregated_stats)
|
|
@@ -28,35 +28,42 @@ class HashStatCalculator(Calculator):
|
|
|
28
28
|
def __init__(self, datum: NDArray[Any], cache: "CalculatorCache", per_channel: bool = False) -> None: # noqa: ARG002
|
|
29
29
|
self.datum = datum
|
|
30
30
|
self.cache = cache
|
|
31
|
+
self.warnings: list[str] = []
|
|
31
32
|
|
|
32
33
|
def get_applicable_flags(self) -> ImageStats:
|
|
33
34
|
"""Return which flags this calculator handles."""
|
|
34
35
|
return ImageStats.HASH
|
|
35
36
|
|
|
36
|
-
def
|
|
37
|
-
|
|
37
|
+
def _collect(self, result: tuple[str, str | None]) -> list[str]:
|
|
38
|
+
hash_value, warning = result
|
|
39
|
+
if warning:
|
|
40
|
+
self.warnings.append(warning)
|
|
41
|
+
return [hash_value]
|
|
38
42
|
|
|
39
|
-
|
|
43
|
+
def _compute_xxhash(self) -> list[str]:
|
|
44
|
+
from dataeval.core._hash import _xxhash
|
|
40
45
|
|
|
41
|
-
|
|
42
|
-
from dataeval.core._hash import phash
|
|
46
|
+
return self._collect(_xxhash(self.cache.image))
|
|
43
47
|
|
|
44
|
-
|
|
48
|
+
def _compute_phash(self) -> list[str]:
|
|
49
|
+
from dataeval.core._hash import _phash
|
|
45
50
|
|
|
46
|
-
|
|
47
|
-
from dataeval.core._hash import phash_d4
|
|
51
|
+
return self._collect(_phash(self.cache.image))
|
|
48
52
|
|
|
49
|
-
|
|
53
|
+
def _compute_phash_d4(self) -> list[str]:
|
|
54
|
+
from dataeval.core._hash import _phash_d4
|
|
50
55
|
|
|
51
|
-
|
|
52
|
-
from dataeval.core._hash import dhash
|
|
56
|
+
return self._collect(_phash_d4(self.cache.image))
|
|
53
57
|
|
|
54
|
-
|
|
58
|
+
def _compute_dhash(self) -> list[str]:
|
|
59
|
+
from dataeval.core._hash import _dhash
|
|
55
60
|
|
|
56
|
-
|
|
57
|
-
from dataeval.core._hash import dhash_d4
|
|
61
|
+
return self._collect(_dhash(self.cache.image))
|
|
58
62
|
|
|
59
|
-
|
|
63
|
+
def _compute_dhash_d4(self) -> list[str]:
|
|
64
|
+
from dataeval.core._hash import _dhash_d4
|
|
65
|
+
|
|
66
|
+
return self._collect(_dhash_d4(self.cache.image))
|
|
60
67
|
|
|
61
68
|
def get_empty_values(self) -> dict[str, Any]:
|
|
62
69
|
"""Return empty values for hash statistics."""
|
|
@@ -71,9 +78,9 @@ class HashStatCalculator(Calculator):
|
|
|
71
78
|
def get_handlers(self) -> dict[ImageStats, tuple[str, Callable[[], list[Any]]]]:
|
|
72
79
|
"""Return mapping of flags to (stat_name, handler_function)."""
|
|
73
80
|
return {
|
|
74
|
-
ImageStats.HASH_XXHASH: ("xxhash", self.
|
|
75
|
-
ImageStats.HASH_PHASH: ("phash", self.
|
|
76
|
-
ImageStats.HASH_DHASH: ("dhash", self.
|
|
77
|
-
ImageStats.HASH_PHASH_D4: ("phash_d4", self.
|
|
78
|
-
ImageStats.HASH_DHASH_D4: ("dhash_d4", self.
|
|
81
|
+
ImageStats.HASH_XXHASH: ("xxhash", self._compute_xxhash),
|
|
82
|
+
ImageStats.HASH_PHASH: ("phash", self._compute_phash),
|
|
83
|
+
ImageStats.HASH_DHASH: ("dhash", self._compute_dhash),
|
|
84
|
+
ImageStats.HASH_PHASH_D4: ("phash_d4", self._compute_phash_d4),
|
|
85
|
+
ImageStats.HASH_DHASH_D4: ("dhash_d4", self._compute_dhash_d4),
|
|
79
86
|
}
|
|
@@ -20,7 +20,7 @@ HASH_SIZE = 8
|
|
|
20
20
|
MAX_FACTOR = 4
|
|
21
21
|
|
|
22
22
|
|
|
23
|
-
def _prepare_image(image: Array3D[Any], min_size: int = HASH_SIZE + 1) -> NDArray[np.uint8] | None:
|
|
23
|
+
def _prepare_image(image: Array3D[Any], min_size: int = HASH_SIZE + 1) -> tuple[NDArray[np.uint8] | None, str | None]:
|
|
24
24
|
"""
|
|
25
25
|
Prepare an image for perceptual hashing by normalizing and converting to grayscale.
|
|
26
26
|
|
|
@@ -33,65 +33,36 @@ def _prepare_image(image: Array3D[Any], min_size: int = HASH_SIZE + 1) -> NDArra
|
|
|
33
33
|
|
|
34
34
|
Returns
|
|
35
35
|
-------
|
|
36
|
-
NDArray[np.uint8] | None
|
|
37
|
-
|
|
36
|
+
tuple[NDArray[np.uint8] | None, str | None]
|
|
37
|
+
A tuple of (grayscale_image, warning). On success, warning is None.
|
|
38
|
+
On failure, grayscale_image is None and warning describes the reason.
|
|
38
39
|
"""
|
|
39
40
|
image_np = as_numpy(image)
|
|
40
41
|
|
|
41
42
|
# Perceptual hashing only works on spatial data (2D or higher)
|
|
42
43
|
if image_np.ndim < 2:
|
|
43
|
-
|
|
44
|
-
return None
|
|
44
|
+
return None, "Perceptual hashing requires spatial data (2D or higher dimensions)"
|
|
45
45
|
|
|
46
46
|
# Verify that the image is at least larger than minimum size
|
|
47
47
|
min_dim = min(image_np.shape[-2:])
|
|
48
48
|
if min_dim < min_size:
|
|
49
|
-
|
|
50
|
-
return None
|
|
49
|
+
return None, "Image too small for perceptual hashing"
|
|
51
50
|
|
|
52
51
|
# Normalize the image shape to CxHxW
|
|
53
52
|
normalized = normalize_image_shape(image_np)
|
|
54
53
|
|
|
55
54
|
# Convert to single-channel grayscale image
|
|
56
|
-
return to_canonical_grayscale(normalized)
|
|
55
|
+
return to_canonical_grayscale(normalized), None
|
|
57
56
|
|
|
58
57
|
|
|
59
|
-
def phash(image: Array3D[Any]) -> str:
|
|
60
|
-
"""
|
|
61
|
-
Compute perceptual hash using Discrete Cosine Transform (DCT).
|
|
62
|
-
|
|
63
|
-
Resizes image to a square NxN using Lanczos algorithm where N is 32x32
|
|
64
|
-
or the largest multiple of 8 smaller than input dimensions. The resampled
|
|
65
|
-
image is compressed using DCT and the lowest frequency component is encoded
|
|
66
|
-
as a bit array of greater or less than median value.
|
|
67
|
-
|
|
68
|
-
Parameters
|
|
69
|
-
----------
|
|
70
|
-
image : Array3D
|
|
71
|
-
An image in CxHxW format. Can be a 3D list, or array-like object.
|
|
72
|
-
|
|
73
|
-
Returns
|
|
74
|
-
-------
|
|
75
|
-
str
|
|
76
|
-
Hex string hash of the image, or empty string if image is too small
|
|
77
|
-
or not spatial data.
|
|
78
|
-
|
|
79
|
-
Notes
|
|
80
|
-
-----
|
|
81
|
-
DCT-based hashing (pHash) is robust to:
|
|
82
|
-
- Scaling and resizing
|
|
83
|
-
- Minor color adjustments
|
|
84
|
-
- Compression artifacts
|
|
85
|
-
|
|
86
|
-
It captures frequency information, making it effective for detecting
|
|
87
|
-
images that have been resized or slightly modified.
|
|
88
|
-
"""
|
|
58
|
+
def _phash(image: Array3D[Any]) -> tuple[str, str | None]:
|
|
59
|
+
"""Compute perceptual hash, returning (hash, warning) tuple."""
|
|
89
60
|
image_np = as_numpy(image)
|
|
90
61
|
_logger.debug("Computing perceptual hash for image with shape: %s", image_np.shape)
|
|
91
62
|
|
|
92
|
-
grayscale = _prepare_image(image_np)
|
|
63
|
+
grayscale, warning = _prepare_image(image_np)
|
|
93
64
|
if grayscale is None:
|
|
94
|
-
return ""
|
|
65
|
+
return "", warning
|
|
95
66
|
|
|
96
67
|
# Calculates the dimensions of the resized square image
|
|
97
68
|
min_dim = min(image_np.shape[-2:])
|
|
@@ -112,43 +83,17 @@ def phash(image: Array3D[Any]) -> str:
|
|
|
112
83
|
hash_hex = np.packbits(padded).tobytes().hex()
|
|
113
84
|
result = hash_hex if hash_hex else "0"
|
|
114
85
|
_logger.debug("Perceptual hash computed: %s", result[:16] + "..." if len(result) > 16 else result)
|
|
115
|
-
return result
|
|
86
|
+
return result, None
|
|
116
87
|
|
|
117
88
|
|
|
118
|
-
def dhash(image: Array3D[Any]) -> str:
|
|
119
|
-
"""
|
|
120
|
-
Compute difference hash (dHash) for an image.
|
|
121
|
-
|
|
122
|
-
Resizes then crops image to 9x8 grayscale and computes horizontal gradient
|
|
123
|
-
by comparing adjacent pixels, producing a 64-bit hash. Captures relative
|
|
124
|
-
brightness changes rather than absolute values.
|
|
125
|
-
|
|
126
|
-
Parameters
|
|
127
|
-
----------
|
|
128
|
-
image : Array3D
|
|
129
|
-
An image in CxHxW format. Can be a 3D list, or array-like object.
|
|
130
|
-
|
|
131
|
-
Returns
|
|
132
|
-
-------
|
|
133
|
-
str
|
|
134
|
-
Hex string hash of the image, or empty string if image is too small
|
|
135
|
-
or not spatial data.
|
|
136
|
-
|
|
137
|
-
Notes
|
|
138
|
-
-----
|
|
139
|
-
Difference hash captures gradient information:
|
|
140
|
-
- Captures structural information via pixel transitions
|
|
141
|
-
- Complementary to DCT-based pHash (frequency vs gradient domain)
|
|
142
|
-
|
|
143
|
-
The horizontal gradient approach makes it particularly effective for
|
|
144
|
-
detecting cropped or slightly shifted versions of images.
|
|
145
|
-
"""
|
|
89
|
+
def _dhash(image: Array3D[Any]) -> tuple[str, str | None]:
|
|
90
|
+
"""Compute difference hash, returning (hash, warning) tuple."""
|
|
146
91
|
image_np = as_numpy(image)
|
|
147
92
|
_logger.debug("Computing difference hash for image with shape: %s", image_np.shape)
|
|
148
93
|
|
|
149
|
-
grayscale = _prepare_image(image_np)
|
|
94
|
+
grayscale, warning = _prepare_image(image_np)
|
|
150
95
|
if grayscale is None:
|
|
151
|
-
return ""
|
|
96
|
+
return "", warning
|
|
152
97
|
|
|
153
98
|
# Resize to 9x8 (9 wide to get 8 differences)
|
|
154
99
|
im = resize(grayscale, HASH_SIZE + 1)
|
|
@@ -162,7 +107,7 @@ def dhash(image: Array3D[Any]) -> str:
|
|
|
162
107
|
hash_hex = np.packbits(diff.flatten()).tobytes().hex()
|
|
163
108
|
result = hash_hex if hash_hex else "0"
|
|
164
109
|
_logger.debug("Difference hash computed: %s", result)
|
|
165
|
-
return result
|
|
110
|
+
return result, None
|
|
166
111
|
|
|
167
112
|
|
|
168
113
|
def _get_d4_transforms(image: NDArray[np.uint8]) -> list[NDArray[np.uint8]]:
|
|
@@ -191,43 +136,8 @@ def _get_d4_transforms(image: NDArray[np.uint8]) -> list[NDArray[np.uint8]]:
|
|
|
191
136
|
return transforms
|
|
192
137
|
|
|
193
138
|
|
|
194
|
-
def phash_d4(image: Array3D[Any]) -> str:
|
|
195
|
-
"""
|
|
196
|
-
Compute orientation-invariant perceptual hash using DCT.
|
|
197
|
-
|
|
198
|
-
Computes phash for all 8 dihedral group transformations (4 rotations ×
|
|
199
|
-
2 flip states) and returns the lexicographically smallest hash as the
|
|
200
|
-
canonical representative.
|
|
201
|
-
|
|
202
|
-
Parameters
|
|
203
|
-
----------
|
|
204
|
-
image : Array3D
|
|
205
|
-
An image in CxHxW format. Can be a 3D list, or array-like object.
|
|
206
|
-
|
|
207
|
-
Returns
|
|
208
|
-
-------
|
|
209
|
-
str
|
|
210
|
-
Canonical hex string hash invariant to rotation and mirroring,
|
|
211
|
-
or empty string if image is too small or not spatial data.
|
|
212
|
-
|
|
213
|
-
Notes
|
|
214
|
-
-----
|
|
215
|
-
This hash is invariant to:
|
|
216
|
-
- 90°, 180°, 270° rotations
|
|
217
|
-
- Horizontal and vertical flips
|
|
218
|
-
- Any combination of rotation and flip
|
|
219
|
-
|
|
220
|
-
The canonical hash is the lexicographically smallest hash among all
|
|
221
|
-
8 orientations, ensuring that any orientation of the same image
|
|
222
|
-
produces the identical hash.
|
|
223
|
-
|
|
224
|
-
Computation cost is ~8x that of regular phash.
|
|
225
|
-
|
|
226
|
-
See Also
|
|
227
|
-
--------
|
|
228
|
-
phash : Standard orientation-sensitive perceptual hash
|
|
229
|
-
dhash_d4 : Orientation-invariant difference hash
|
|
230
|
-
"""
|
|
139
|
+
def _phash_d4(image: Array3D[Any]) -> tuple[str, str | None]:
|
|
140
|
+
"""Compute orientation-invariant perceptual hash, returning (hash, warning) tuple."""
|
|
231
141
|
from scipy.fftpack import dct
|
|
232
142
|
|
|
233
143
|
from dataeval.utils.preprocessing import normalize_image_shape, resize, to_canonical_grayscale
|
|
@@ -239,11 +149,11 @@ def phash_d4(image: Array3D[Any]) -> str:
|
|
|
239
149
|
|
|
240
150
|
# Validate input
|
|
241
151
|
if image_np.ndim < 2:
|
|
242
|
-
return ""
|
|
152
|
+
return "", "Perceptual hashing requires spatial data (2D or higher dimensions)"
|
|
243
153
|
|
|
244
154
|
min_dim = min(image_np.shape[-2:])
|
|
245
155
|
if min_dim < hash_size + 1:
|
|
246
|
-
return ""
|
|
156
|
+
return "", "Image too small for perceptual hashing"
|
|
247
157
|
|
|
248
158
|
# Prepare grayscale image
|
|
249
159
|
normalized = normalize_image_shape(image_np)
|
|
@@ -269,14 +179,133 @@ def phash_d4(image: Array3D[Any]) -> str:
|
|
|
269
179
|
hashes.append(hash_hex if hash_hex else "0")
|
|
270
180
|
|
|
271
181
|
# Return canonical (lexicographically smallest) hash
|
|
272
|
-
return min(hashes)
|
|
182
|
+
return min(hashes), None
|
|
273
183
|
|
|
274
184
|
|
|
275
|
-
def dhash_d4(image: Array3D[Any]) -> str:
|
|
185
|
+
def _dhash_d4(image: Array3D[Any]) -> tuple[str, str | None]:
|
|
186
|
+
"""Compute orientation-invariant difference hash, returning (hash, warning) tuple."""
|
|
187
|
+
from dataeval.utils.preprocessing import normalize_image_shape, resize, to_canonical_grayscale
|
|
188
|
+
|
|
189
|
+
hash_size = 8
|
|
190
|
+
|
|
191
|
+
image_np = as_numpy(image)
|
|
192
|
+
|
|
193
|
+
# Validate input
|
|
194
|
+
if image_np.ndim < 2:
|
|
195
|
+
return "", "Perceptual hashing requires spatial data (2D or higher dimensions)"
|
|
196
|
+
|
|
197
|
+
min_dim = min(image_np.shape[-2:])
|
|
198
|
+
if min_dim < hash_size + 1:
|
|
199
|
+
return "", "Image too small for perceptual hashing"
|
|
200
|
+
|
|
201
|
+
# Prepare grayscale image
|
|
202
|
+
normalized = normalize_image_shape(image_np)
|
|
203
|
+
grayscale = to_canonical_grayscale(normalized)
|
|
204
|
+
|
|
205
|
+
# Compute hash for each D4 transformation
|
|
206
|
+
hashes: list[str] = []
|
|
207
|
+
for transformed in _get_d4_transforms(grayscale):
|
|
208
|
+
# Resize to 9x8 (9 wide to get 8 horizontal differences)
|
|
209
|
+
im = resize(transformed, hash_size + 1)
|
|
210
|
+
im = im[:hash_size, : hash_size + 1]
|
|
211
|
+
|
|
212
|
+
# Compute horizontal gradient
|
|
213
|
+
diff = im[:, :-1] > im[:, 1:]
|
|
214
|
+
|
|
215
|
+
# Convert to hex
|
|
216
|
+
hash_hex = np.packbits(diff.flatten()).tobytes().hex()
|
|
217
|
+
hashes.append(hash_hex if hash_hex else "0")
|
|
218
|
+
|
|
219
|
+
# Return canonical (lexicographically smallest) hash
|
|
220
|
+
return min(hashes), None
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _xxhash(image: Array3D[Any]) -> tuple[str, str | None]:
|
|
224
|
+
"""Compute xxhash, returning (hash, warning) tuple."""
|
|
225
|
+
image_np = as_numpy(image)
|
|
226
|
+
_logger.debug("Computing xxhash for image with shape: %s", image_np.shape)
|
|
227
|
+
hash_result = xxh.xxh3_64_hexdigest(image_np.ravel().tobytes())
|
|
228
|
+
_logger.debug("xxhash computed: %s", hash_result)
|
|
229
|
+
return hash_result, None
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def _log_and_return(result: tuple[str, str | None]) -> str:
|
|
233
|
+
"""Log the warning (if any) and return just the hash string."""
|
|
234
|
+
hash_value, warning = result
|
|
235
|
+
if warning:
|
|
236
|
+
_logger.warning(warning)
|
|
237
|
+
return hash_value
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def phash(image: Array3D[Any]) -> str:
|
|
276
241
|
"""
|
|
277
|
-
Compute orientation-invariant difference hash using gradients.
|
|
242
|
+
Compute perceptual hash using Discrete Cosine Transform (DCT).
|
|
278
243
|
|
|
279
|
-
|
|
244
|
+
Resizes image to a square NxN using Lanczos algorithm where N is 32x32
|
|
245
|
+
or the largest multiple of 8 smaller than input dimensions. The resampled
|
|
246
|
+
image is compressed using DCT and the lowest frequency component is encoded
|
|
247
|
+
as a bit array of greater or less than median value.
|
|
248
|
+
|
|
249
|
+
Parameters
|
|
250
|
+
----------
|
|
251
|
+
image : Array3D
|
|
252
|
+
An image in CxHxW format. Can be a 3D list, or array-like object.
|
|
253
|
+
|
|
254
|
+
Returns
|
|
255
|
+
-------
|
|
256
|
+
str
|
|
257
|
+
Hex string hash of the image, or empty string if image is too small
|
|
258
|
+
or not spatial data.
|
|
259
|
+
|
|
260
|
+
Notes
|
|
261
|
+
-----
|
|
262
|
+
DCT-based hashing (pHash) is robust to:
|
|
263
|
+
- Scaling and resizing
|
|
264
|
+
- Minor color adjustments
|
|
265
|
+
- Compression artifacts
|
|
266
|
+
|
|
267
|
+
It captures frequency information, making it effective for detecting
|
|
268
|
+
images that have been resized or slightly modified.
|
|
269
|
+
"""
|
|
270
|
+
return _log_and_return(_phash(image))
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def dhash(image: Array3D[Any]) -> str:
|
|
274
|
+
"""
|
|
275
|
+
Compute difference hash (dHash) for an image.
|
|
276
|
+
|
|
277
|
+
Resizes then crops image to 9x8 grayscale and computes horizontal gradient
|
|
278
|
+
by comparing adjacent pixels, producing a 64-bit hash. Captures relative
|
|
279
|
+
brightness changes rather than absolute values.
|
|
280
|
+
|
|
281
|
+
Parameters
|
|
282
|
+
----------
|
|
283
|
+
image : Array3D
|
|
284
|
+
An image in CxHxW format. Can be a 3D list, or array-like object.
|
|
285
|
+
|
|
286
|
+
Returns
|
|
287
|
+
-------
|
|
288
|
+
str
|
|
289
|
+
Hex string hash of the image, or empty string if image is too small
|
|
290
|
+
or not spatial data.
|
|
291
|
+
|
|
292
|
+
Notes
|
|
293
|
+
-----
|
|
294
|
+
Difference hash captures gradient information:
|
|
295
|
+
- Captures structural information via pixel transitions
|
|
296
|
+
- Complementary to DCT-based pHash (frequency vs gradient domain)
|
|
297
|
+
|
|
298
|
+
The horizontal gradient approach makes it particularly effective for
|
|
299
|
+
detecting cropped or slightly shifted versions of images.
|
|
300
|
+
"""
|
|
301
|
+
return _log_and_return(_dhash(image))
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
def phash_d4(image: Array3D[Any]) -> str:
|
|
305
|
+
"""
|
|
306
|
+
Compute orientation-invariant perceptual hash using DCT.
|
|
307
|
+
|
|
308
|
+
Computes phash for all 8 dihedral group transformations (4 rotations ×
|
|
280
309
|
2 flip states) and returns the lexicographically smallest hash as the
|
|
281
310
|
canonical representative.
|
|
282
311
|
|
|
@@ -302,47 +331,54 @@ def dhash_d4(image: Array3D[Any]) -> str:
|
|
|
302
331
|
8 orientations, ensuring that any orientation of the same image
|
|
303
332
|
produces the identical hash.
|
|
304
333
|
|
|
305
|
-
Computation cost is ~8x that of regular
|
|
334
|
+
Computation cost is ~8x that of regular phash.
|
|
306
335
|
|
|
307
336
|
See Also
|
|
308
337
|
--------
|
|
309
|
-
|
|
310
|
-
|
|
338
|
+
phash : Standard orientation-sensitive perceptual hash
|
|
339
|
+
dhash_d4 : Orientation-invariant difference hash
|
|
311
340
|
"""
|
|
312
|
-
|
|
341
|
+
return _log_and_return(_phash_d4(image))
|
|
313
342
|
|
|
314
|
-
hash_size = 8
|
|
315
343
|
|
|
316
|
-
|
|
344
|
+
def dhash_d4(image: Array3D[Any]) -> str:
|
|
345
|
+
"""
|
|
346
|
+
Compute orientation-invariant difference hash using gradients.
|
|
317
347
|
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
348
|
+
Computes dhash for all 8 dihedral group transformations (4 rotations ×
|
|
349
|
+
2 flip states) and returns the lexicographically smallest hash as the
|
|
350
|
+
canonical representative.
|
|
321
351
|
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
352
|
+
Parameters
|
|
353
|
+
----------
|
|
354
|
+
image : Array3D
|
|
355
|
+
An image in CxHxW format. Can be a 3D list or an array-like object.
|
|
325
356
|
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
357
|
+
Returns
|
|
358
|
+
-------
|
|
359
|
+
str
|
|
360
|
+
Canonical hex string hash invariant to rotation and mirroring,
|
|
361
|
+
or empty string if image is too small or not spatial data.
|
|
329
362
|
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
363
|
+
Notes
|
|
364
|
+
-----
|
|
365
|
+
This hash is invariant to:
|
|
366
|
+
- 90°, 180°, 270° rotations
|
|
367
|
+
- Horizontal and vertical flips
|
|
368
|
+
- Any combination of rotation and flip
|
|
336
369
|
|
|
337
|
-
|
|
338
|
-
|
|
370
|
+
The canonical hash is the lexicographically smallest hash among all
|
|
371
|
+
8 orientations, ensuring that any orientation of the same image
|
|
372
|
+
produces the identical hash.
|
|
339
373
|
|
|
340
|
-
|
|
341
|
-
hash_hex = np.packbits(diff.flatten()).tobytes().hex()
|
|
342
|
-
hashes.append(hash_hex if hash_hex else "0")
|
|
374
|
+
Computation cost is ~8x that of regular dhash.
|
|
343
375
|
|
|
344
|
-
|
|
345
|
-
|
|
376
|
+
See Also
|
|
377
|
+
--------
|
|
378
|
+
dhash : Standard orientation-sensitive difference hash
|
|
379
|
+
phash_d4 : Orientation-invariant perceptual hash
|
|
380
|
+
"""
|
|
381
|
+
return _log_and_return(_dhash_d4(image))
|
|
346
382
|
|
|
347
383
|
|
|
348
384
|
def xxhash(image: Array3D[Any]) -> str:
|
|
@@ -368,11 +404,7 @@ def xxhash(image: Array3D[Any]) -> str:
|
|
|
368
404
|
hashes, it will produce completely different values for images that
|
|
369
405
|
differ by even a single pixel.
|
|
370
406
|
"""
|
|
371
|
-
|
|
372
|
-
_logger.debug("Computing xxhash for image with shape: %s", image_np.shape)
|
|
373
|
-
hash_result = xxh.xxh3_64_hexdigest(image_np.ravel().tobytes())
|
|
374
|
-
_logger.debug("xxhash computed: %s", hash_result)
|
|
375
|
-
return hash_result
|
|
407
|
+
return _log_and_return(_xxhash(image))
|
|
376
408
|
|
|
377
409
|
|
|
378
410
|
def hamming_distance(hash1: str, hash2: str) -> int:
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
__all__ = []
|
|
4
4
|
|
|
5
|
-
from collections.abc import Sequence
|
|
5
|
+
from collections.abc import Iterator, Sequence
|
|
6
6
|
from dataclasses import dataclass
|
|
7
7
|
from typing import Any, Generic, Literal, NamedTuple, TypeVar, overload
|
|
8
8
|
|
|
@@ -75,6 +75,9 @@ class NearDuplicateGroup(Generic[TIndexType]):
|
|
|
75
75
|
methods: frozenset[str]
|
|
76
76
|
orientation: Literal["rotated", "same"] | None = None
|
|
77
77
|
|
|
78
|
+
def __iter__(self) -> Iterator[TIndexType]:
|
|
79
|
+
yield from self.indices
|
|
80
|
+
|
|
78
81
|
def __repr__(self) -> str:
|
|
79
82
|
orientation = f", orientation={self.orientation}" if self.orientation else ""
|
|
80
83
|
return f"NearDuplicateGroup({list(self.indices)}, methods={sorted(self.methods)}{orientation})"
|