dataeval 1.0.3__tar.gz → 1.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. {dataeval-1.0.3 → dataeval-1.0.4}/PKG-INFO +1 -1
  2. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/_metadata.py +37 -19
  3. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/_version.py +2 -2
  4. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/bias/_balance.py +3 -1
  5. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/bias/_diversity.py +1 -1
  6. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_calculators/_cache.py +11 -2
  7. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_calculators/_pixelstats.py +14 -2
  8. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_compute_stats.py +28 -7
  9. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/quality/_duplicates.py +58 -26
  10. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/quality/_outliers.py +7 -2
  11. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/utils/preprocessing.py +7 -1
  12. {dataeval-1.0.3 → dataeval-1.0.4}/.gitignore +0 -0
  13. {dataeval-1.0.3 → dataeval-1.0.4}/LICENSE +0 -0
  14. {dataeval-1.0.3 → dataeval-1.0.4}/README.md +0 -0
  15. {dataeval-1.0.3 → dataeval-1.0.4}/pyproject.toml +0 -0
  16. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/__init__.py +0 -0
  17. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/_embeddings.py +0 -0
  18. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/_experimental.py +0 -0
  19. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/_helpers.py +0 -0
  20. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/_log.py +0 -0
  21. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/_warm_cache.py +0 -0
  22. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/bias/__init__.py +0 -0
  23. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/bias/_parity.py +0 -0
  24. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/config.py +0 -0
  25. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/__init__.py +0 -0
  26. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_ber.py +0 -0
  27. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_bin.py +0 -0
  28. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_calculators/__init__.py +0 -0
  29. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_calculators/_base.py +0 -0
  30. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_calculators/_dimensionstats.py +0 -0
  31. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_calculators/_hashstats.py +0 -0
  32. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_calculators/_register.py +0 -0
  33. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_calculators/_registry.py +0 -0
  34. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_calculators/_visualstats.py +0 -0
  35. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_clusterer.py +0 -0
  36. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_completeness.py +0 -0
  37. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_compute_ratios.py +0 -0
  38. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_coverage.py +0 -0
  39. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_divergence.py +0 -0
  40. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_diversity.py +0 -0
  41. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_fast_hdbscan/_cluster_trees.py +0 -0
  42. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_fast_hdbscan/_disjoint_set.py +0 -0
  43. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_fast_hdbscan/_mst.py +0 -0
  44. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_feature_distance.py +0 -0
  45. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_hash.py +0 -0
  46. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_label_errors.py +0 -0
  47. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_label_parity.py +0 -0
  48. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_label_stats.py +0 -0
  49. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_metadata_insights.py +0 -0
  50. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_mst.py +0 -0
  51. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_mutual_info.py +0 -0
  52. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_nullmodel.py +0 -0
  53. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_parity.py +0 -0
  54. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_rank.py +0 -0
  55. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/core/_uap.py +0 -0
  56. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/exceptions.py +0 -0
  57. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/extractors/__init__.py +0 -0
  58. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/extractors/_bovw.py +0 -0
  59. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/extractors/_flatten.py +0 -0
  60. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/extractors/_onnx.py +0 -0
  61. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/extractors/_torch.py +0 -0
  62. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/extractors/_uncertainty.py +0 -0
  63. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/flags.py +0 -0
  64. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/performance/__init__.py +0 -0
  65. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/performance/_aggregator.py +0 -0
  66. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/performance/_output.py +0 -0
  67. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/performance/_sufficiency.py +0 -0
  68. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/performance/schedules.py +0 -0
  69. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/protocols.py +0 -0
  70. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/py.typed +0 -0
  71. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/quality/__init__.py +0 -0
  72. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/quality/_shared.py +0 -0
  73. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/scope/__init__.py +0 -0
  74. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/scope/_prioritize.py +0 -0
  75. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/selection/__init__.py +0 -0
  76. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/selection/_classbalance.py +0 -0
  77. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/selection/_classfilter.py +0 -0
  78. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/selection/_indices.py +0 -0
  79. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/selection/_limit.py +0 -0
  80. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/selection/_reverse.py +0 -0
  81. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/selection/_select.py +0 -0
  82. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/selection/_shuffle.py +0 -0
  83. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/shift/__init__.py +0 -0
  84. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/shift/_drift/__init__.py +0 -0
  85. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/shift/_drift/_base.py +0 -0
  86. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/shift/_drift/_chunk.py +0 -0
  87. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/shift/_drift/_domain_classifier.py +0 -0
  88. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/shift/_drift/_kneighbors.py +0 -0
  89. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/shift/_drift/_mmd.py +0 -0
  90. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/shift/_drift/_reconstruction.py +0 -0
  91. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/shift/_drift/_univariate.py +0 -0
  92. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/shift/_ood/__init__.py +0 -0
  93. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/shift/_ood/_base.py +0 -0
  94. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/shift/_ood/_domain_classifier.py +0 -0
  95. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/shift/_ood/_kneighbors.py +0 -0
  96. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/shift/_ood/_reconstruction.py +0 -0
  97. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/shift/_shared/__init__.py +0 -0
  98. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/shift/_shared/_domain_classifier.py +0 -0
  99. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/shift/_shared/_kneighbors.py +0 -0
  100. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/shift/_shared/_reconstruction.py +0 -0
  101. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/shift/update_strategies.py +0 -0
  102. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/types.py +0 -0
  103. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/utils/__init__.py +0 -0
  104. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/utils/_internal.py +0 -0
  105. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/utils/data.py +0 -0
  106. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/utils/losses.py +0 -0
  107. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/utils/models.py +0 -0
  108. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/utils/onnx.py +0 -0
  109. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/utils/thresholds.py +0 -0
  110. {dataeval-1.0.3 → dataeval-1.0.4}/src/dataeval/utils/training.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataeval
3
- Version: 1.0.3
3
+ Version: 1.0.4
4
4
  Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
5
5
  Project-URL: Homepage, https://dataeval.ai/
6
6
  Project-URL: Repository, https://github.com/aria-ml/dataeval/
@@ -39,6 +39,7 @@ class FactorInfo:
39
39
  factor_type: Literal["categorical", "continuous", "discrete"]
40
40
  is_binned: bool = False
41
41
  is_digitized: bool = False
42
+ level: Literal["image", "target"] = "image"
42
43
 
43
44
 
44
45
  def _to_col(name: str, info: FactorInfo, binned: bool = True) -> str:
@@ -110,8 +111,8 @@ class Metadata(Array, FeatureExtractor):
110
111
  *,
111
112
  continuous_factor_bins: Mapping[str, int | Sequence[float]] | None = None,
112
113
  auto_bin_method: Literal["uniform_width", "uniform_count", "clusters"] = "uniform_width",
113
- exclude: Sequence[str] | None = None,
114
- include: Sequence[str] | None = None,
114
+ exclude: str | Sequence[str] | None = None,
115
+ include: str | Sequence[str] | None = None,
115
116
  ) -> None:
116
117
  self._class_labels: NDArray[np.intp]
117
118
  self._item_indices: NDArray[np.intp]
@@ -132,8 +133,8 @@ class Metadata(Array, FeatureExtractor):
132
133
  if exclude is not None and include is not None:
133
134
  raise ValueError("Filters for `exclude` and `include` are mutually exclusive.")
134
135
 
135
- self._exclude = set(exclude or ())
136
- self._include = set(include or ())
136
+ self._exclude = {exclude} if isinstance(exclude, str) else set(exclude or ())
137
+ self._include = {include} if isinstance(include, str) else set(include or ())
137
138
  self._target_factors_only = False
138
139
 
139
140
  def __repr__(self) -> str:
@@ -423,17 +424,17 @@ class Metadata(Array, FeatureExtractor):
423
424
  return self._exclude
424
425
 
425
426
  @exclude.setter
426
- def exclude(self, value: Sequence[str]) -> None:
427
+ def exclude(self, value: str | Sequence[str]) -> None:
427
428
  """Set factor names to exclude from processing.
428
429
 
429
430
  Automatically clears include filter and resets binning state when exclusion list changes.
430
431
 
431
432
  Parameters
432
433
  ----------
433
- value : Sequence[str]
434
- Factor names to exclude from metadata analysis.
434
+ value : str | Sequence[str]
435
+ Factor name or names to exclude from metadata analysis.
435
436
  """
436
- exclude = set(value)
437
+ exclude = {value} if isinstance(value, str) else set(value)
437
438
  if self._exclude != exclude:
438
439
  self._exclude = exclude
439
440
  self._include = set()
@@ -451,7 +452,7 @@ class Metadata(Array, FeatureExtractor):
451
452
  return self._include
452
453
 
453
454
  @include.setter
454
- def include(self, value: Sequence[str]) -> None:
455
+ def include(self, value: str | Sequence[str]) -> None:
455
456
  """Set factor names to include in processing.
456
457
 
457
458
  Automatically clears exclude filter and resets binning state when
@@ -459,10 +460,10 @@ class Metadata(Array, FeatureExtractor):
459
460
 
460
461
  Parameters
461
462
  ----------
462
- value : Sequence[str]
463
- Factor names to include in metadata analysis.
463
+ value : str | Sequence[str]
464
+ Factor name or names to include in metadata analysis.
464
465
  """
465
- include = set(value)
466
+ include = {value} if isinstance(value, str) else set(value)
466
467
  if self._include != include:
467
468
  self._include = include
468
469
  self._exclude = set()
@@ -1115,13 +1116,26 @@ class Metadata(Array, FeatureExtractor):
1115
1116
  raise ValueError(f"Invalid level: {level}. Must be 'image', 'target', or 'auto'")
1116
1117
 
1117
1118
  def _create_factor_column(self, data_array: NDArray, level: str, num_image_rows: int) -> list:
1118
- """Create a factor column with values at the appropriate level."""
1119
+ """Create a factor column with values at the appropriate level.
1120
+
1121
+ For OD datasets with image-level factors, values are stored in image rows
1122
+ and replicated to target rows using item_index mapping, so that bias
1123
+ evaluators can access them via target_data.
1124
+ """
1119
1125
  if level == "image":
1120
- # Create column: image-level values in image rows, None in target rows
1121
- full_data = [None] * len(self.dataframe)
1122
- for idx, val in enumerate(data_array):
1123
- full_data[idx] = val # Image rows come first in our structure
1124
- return full_data
1126
+ # Image rows get the values directly
1127
+ image_values: list = data_array.tolist()
1128
+
1129
+ if self.has_targets():
1130
+ # For OD datasets, replicate image-level values to target rows
1131
+ # using the item_index column which maps each target to its source image
1132
+ target_df = self._dataframe.filter(pl.col("target_index").is_not_null())
1133
+ target_image_indices = target_df["item_index"].to_numpy()
1134
+ target_values = data_array[target_image_indices].tolist()
1135
+ else:
1136
+ target_values = []
1137
+
1138
+ return image_values + target_values
1125
1139
  # level == "target"
1126
1140
  # Create column: None in image rows, target-level values in target rows
1127
1141
  return [None] * num_image_rows + list(data_array)
@@ -1267,7 +1281,8 @@ class Metadata(Array, FeatureExtractor):
1267
1281
  k for k in factors if not isinstance(self._dataframe.schema.get(k), pl.List | pl.Struct | pl.Array)
1268
1282
  }
1269
1283
 
1270
- self._factors = dict.fromkeys(usable_factors, None)
1284
+ existing = self._factors if hasattr(self, "_factors") else {}
1285
+ self._factors = {k: existing.get(k) for k in usable_factors}
1271
1286
 
1272
1287
  def _structure(
1273
1288
  self,
@@ -1492,9 +1507,12 @@ class Metadata(Array, FeatureExtractor):
1492
1507
  factors_to_process = [col for col in self.factor_names if not {_binned(col), _digitized(col)} & column_set]
1493
1508
  total_factors = len(factors_to_process)
1494
1509
 
1510
+ target_only = self._target_factors - self._image_factors if is_od else set()
1495
1511
  for i, col in enumerate(factors_to_process):
1496
1512
  data = data_df[col].to_numpy()
1497
1513
  df, info = self._process_factor(df, col, data, factor_bins, is_od)
1514
+ if is_od and col in target_only:
1515
+ info.level = "target"
1498
1516
  factor_info[col] = info
1499
1517
 
1500
1518
  if progress_callback:
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '1.0.3'
32
- __version_tuple__ = version_tuple = (1, 0, 3)
31
+ __version__ = version = '1.0.4'
32
+ __version_tuple__ = version_tuple = (1, 0, 4)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -3,6 +3,7 @@ __all__ = []
3
3
  from dataclasses import dataclass
4
4
  from typing import Any, Literal
5
5
 
6
+ import numpy as np
6
7
  import polars as pl
7
8
 
8
9
  from dataeval import Metadata
@@ -269,8 +270,9 @@ class Balance(Evaluator):
269
270
  # Include class_label as the first factor (index 0), then all metadata factors
270
271
  all_factor_names = ["class_label"] + factor_names
271
272
 
273
+ u_classes = np.unique(self.metadata.class_labels)
272
274
  for class_idx in range(classwise.shape[0]):
273
- class_name = index2label.get(class_idx, str(class_idx))
275
+ class_name = index2label.get(int(u_classes[class_idx]), str(u_classes[class_idx]))
274
276
  for factor_idx in range(classwise.shape[1]):
275
277
  mi_value = classwise[class_idx, factor_idx]
276
278
  class_name_col.append(class_name)
@@ -251,7 +251,7 @@ class Diversity(Evaluator):
251
251
  is_low_diversity_col: list[bool] = []
252
252
 
253
253
  for class_idx in range(classwise_div.shape[0]):
254
- class_name = index2label.get(class_idx, str(class_idx))
254
+ class_name = index2label.get(int(u_classes[class_idx]), str(u_classes[class_idx]))
255
255
  for factor_idx in range(num_factors):
256
256
  div_value = classwise_div[class_idx, factor_idx]
257
257
  if not np.isnan(div_value):
@@ -22,7 +22,13 @@ class CalculatorCache:
22
22
  This class adapts based on the data type passed in.
23
23
  """
24
24
 
25
- def __init__(self, datum: Any, box: BoundingBox | None = None, per_channel: bool = False) -> None:
25
+ def __init__(
26
+ self,
27
+ datum: Any,
28
+ box: BoundingBox | None = None,
29
+ per_channel: bool = False,
30
+ normalize_pixel_values: bool = False,
31
+ ) -> None:
26
32
  is_spatial = len(datum.shape) >= 2
27
33
  self.raw = datum
28
34
  # Assume image data for now (will be generic in future)
@@ -30,6 +36,7 @@ class CalculatorCache:
30
36
  self.height: int = datum.shape[-2] if is_spatial else 0
31
37
  self.shape: tuple[int, ...] = datum.shape
32
38
  self.per_channel_mode = per_channel
39
+ self.normalize_pixel_values = normalize_pixel_values
33
40
  self.has_box = box is not None
34
41
 
35
42
  # Ensure bounding box
@@ -54,7 +61,9 @@ class CalculatorCache:
54
61
 
55
62
  @cached_property
56
63
  def scaled(self) -> NDArray[Any]:
57
- return rescale(self.image)
64
+ if self.normalize_pixel_values:
65
+ return rescale(self.image)
66
+ return self.image
58
67
 
59
68
  @cached_property
60
69
  def per_channel(self) -> NDArray[Any]:
@@ -39,11 +39,23 @@ class PixelStatCalculator(Calculator[ImageStats]):
39
39
  def _var_func(self, data: NDArray[Any], **kw: Any) -> Any:
40
40
  return np.nanvar(data, **kw) if self._has_nan else np.var(data, **kw)
41
41
 
42
+ @cached_property
43
+ def _histogram_range(self) -> tuple[float, float]:
44
+ if self.cache.normalize_pixel_values:
45
+ return (0.0, 1.0)
46
+ from dataeval.utils.preprocessing import get_bitdepth
47
+
48
+ bitdepth = get_bitdepth(self.cache.scaled)
49
+ if bitdepth.depth == 0:
50
+ return (0.0, 1.0)
51
+ return (0.0, float(bitdepth.pmax))
52
+
42
53
  @cached_property
43
54
  def histogram(self) -> NDArray[np.float64]:
55
+ r = self._histogram_range
44
56
  if self.per_channel_mode:
45
- return np.apply_along_axis(lambda y: np.histogram(y, bins=256, range=(0, 1))[0], 1, self.cache.per_channel)
46
- return np.histogram(self.cache.scaled, bins=256, range=(0, 1))[0]
57
+ return np.apply_along_axis(lambda y: np.histogram(y, bins=256, range=r)[0], 1, self.cache.per_channel)
58
+ return np.histogram(self.cache.scaled, bins=256, range=r)[0]
47
59
 
48
60
  def get_applicable_flags(self) -> ImageStats:
49
61
  """Return which flags this calculator handles."""
@@ -1,6 +1,7 @@
1
1
  __all__ = []
2
2
 
3
3
  import logging
4
+ import warnings
4
5
  from collections.abc import Iterable, Iterator, Mapping, Sequence, Sized
5
6
  from dataclasses import dataclass
6
7
  from enum import Flag
@@ -103,6 +104,7 @@ def _collect_calculator_stats(
103
104
  datum: NDArray[Any],
104
105
  box: BoundingBox | None,
105
106
  per_channel: bool,
107
+ normalize_pixel_values: bool = False,
106
108
  ) -> tuple[list[dict[str, list[Any]]], dict[str, Any], list[str]]:
107
109
  """
108
110
  Collect stats from all calculators.
@@ -118,7 +120,7 @@ def _collect_calculator_stats(
118
120
  stats_list = []
119
121
  empty_values_map: dict[str, Any] = {}
120
122
  warnings: list[str] = []
121
- processor = CalculatorCache(datum, box, per_channel)
123
+ processor = CalculatorCache(datum, box, per_channel, normalize_pixel_values=normalize_pixel_values)
122
124
  for calculator_cls, flags in calculators:
123
125
  calculator = calculator_cls(datum, processor, per_channel)
124
126
  stats_list.append(calculator.compute(flags))
@@ -224,6 +226,7 @@ def _compute_batch(
224
226
  per_image: bool,
225
227
  per_target: bool,
226
228
  per_channel: bool,
229
+ normalize_pixel_values: bool = False,
227
230
  ) -> DatumBatchResult:
228
231
  i, datum, boxes = args
229
232
  results: list[DatumResult] = []
@@ -248,7 +251,7 @@ def _compute_batch(
248
251
 
249
252
  # Collect stats from all calculators
250
253
  calculator_stats, empty_values_map, calc_warnings = _collect_calculator_stats(
251
- calculators, datum, box, per_channel
254
+ calculators, datum, box, per_channel, normalize_pixel_values=normalize_pixel_values
252
255
  )
253
256
 
254
257
  # Thread calculator warnings with index context
@@ -338,6 +341,9 @@ def _aggregate_batch(
338
341
  warning_list.extend(result.warnings_list)
339
342
 
340
343
 
344
+ _UNSET = object()
345
+
346
+
341
347
  def compute_stats(
342
348
  data: Iterable[ArrayLike] | Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]],
343
349
  *,
@@ -346,16 +352,12 @@ def compute_stats(
346
352
  per_image: bool = True,
347
353
  per_target: bool = True,
348
354
  per_channel: bool = False,
355
+ normalize_pixel_values: bool = _UNSET, # type: ignore
349
356
  progress_callback: ProgressCallback | None = None,
350
357
  ) -> StatsResult:
351
358
  """
352
359
  Compute specified statistics on a set of images, optionally within bounding boxes.
353
360
 
354
- Mixed-bit-depth datasets can produce misleading statistics when raw pixel values are
355
- compared directly. To avoid this, pixel values are normalized to [0, 1] based on each
356
- image's bit depth before any statistic is computed, keeping results meaningful and
357
- comparable across 8-bit, 16-bit, 32-bit, and other precision images.
358
-
359
361
  Parameters
360
362
  ----------
361
363
  data : Iterable[ArrayLike] | Dataset[ArrayLike] | Dataset[tuple[ArrayLike, Any, Any]]
@@ -376,6 +378,15 @@ def compute_stats(
376
378
  per_channel : bool, default False
377
379
  If True, compute per-channel statistics. If False, statistics are
378
380
  aggregated across all channels.
381
+ normalize_pixel_values : bool, default True
382
+ If True, pixel values are normalized to [0, 1] based on each image's
383
+ inferred bit depth before any statistic is computed. This makes results
384
+ comparable across images with different bit depths (8-bit, 16-bit, etc.).
385
+ If False, statistics are computed on raw pixel values.
386
+
387
+ .. deprecated::
388
+ The default will change to False in v1.1. Pass explicitly to silence
389
+ the deprecation warning.
379
390
  progress_callback : ProgressCallback or None, default None
380
391
  Callback to report progress during calculation. Called after each image is processed
381
392
  with the current image count and total number of images (if known).
@@ -422,6 +433,15 @@ def compute_stats(
422
433
 
423
434
  >>> stats = compute_stats(images, boxes=boxes, per_image=True, per_target=True, per_channel=True)
424
435
  """
436
+ if normalize_pixel_values is _UNSET:
437
+ warnings.warn(
438
+ "The default value of normalize_pixel_values will change from True to False in v1.1. "
439
+ "Pass normalize_pixel_values explicitly to silence this warning.",
440
+ FutureWarning,
441
+ stacklevel=2,
442
+ )
443
+ normalize_pixel_values = True
444
+
425
445
  source_indices: list[SourceIndex] = []
426
446
  aggregated_stats: dict[str, list[Any]] = {}
427
447
  object_count: dict[int, int] = {}
@@ -484,6 +504,7 @@ def compute_stats(
484
504
  per_image=per_image,
485
505
  per_target=per_target,
486
506
  per_channel=per_channel,
507
+ normalize_pixel_values=normalize_pixel_values,
487
508
  ),
488
509
  _enumerate_datum(images, boxes),
489
510
  ):
@@ -2,6 +2,7 @@
2
2
 
3
3
  __all__ = []
4
4
 
5
+ import warnings
5
6
  from collections.abc import Mapping, Sequence
6
7
  from typing import Any, Generic, Literal, TypeVar, overload
7
8
 
@@ -246,10 +247,10 @@ def _group_by_dataset(row: Mapping[str, Any], has_targets: bool) -> dict[int, li
246
247
  """Group a row's members by dataset index."""
247
248
  by_ds: dict[int, list[Any]] = {}
248
249
  if has_targets:
249
- for item, target, ds in zip(row["item_indices"], row["target_indices"], row["dataset_index"], strict=True):
250
+ for item, target, ds in zip(row["item_indices"], row["target_indices"], row["dataset_indices"], strict=True):
250
251
  by_ds.setdefault(ds, []).append(SourceIndex(item=item, target=target))
251
252
  else:
252
- for item, ds in zip(row["item_indices"], row["dataset_index"], strict=True):
253
+ for item, ds in zip(row["item_indices"], row["dataset_indices"], strict=True):
253
254
  by_ds.setdefault(ds, []).append(item)
254
255
  return by_ds
255
256
 
@@ -323,11 +324,11 @@ def _make_row(
323
324
  "dup_type": dup_type,
324
325
  "item_indices": item_ids,
325
326
  "target_indices": target_ids,
326
- "methods": methods,
327
- "orientation": orientation,
328
327
  }
329
328
  if ds_ids is not None:
330
- row["dataset_index"] = ds_ids
329
+ row["dataset_indices"] = ds_ids
330
+ row["methods"] = methods
331
+ row["orientation"] = orientation
331
332
  return row
332
333
 
333
334
 
@@ -492,7 +493,7 @@ class DuplicatesOutput(DataFrameOutput, Generic[TExactDuplicatesGroup, TNearDupl
492
493
  - methods: list[str] - Detection method names (e.g., ``["phash", "dhash"]``)
493
494
  - orientation: str | None - ``"same"``, ``"rotated"``, or None (only present
494
495
  when both basic and D4 hashes were computed)
495
- - dataset_index: list[int] - Dataset indices for cross-dataset results (only
496
+ - dataset_indices: list[int] - Dataset indices for cross-dataset results (only
496
497
  present for multi-dataset output, positionally aligned with item_indices)
497
498
 
498
499
  Attributes
@@ -529,6 +530,19 @@ class DuplicatesOutput(DataFrameOutput, Generic[TExactDuplicatesGroup, TNearDupl
529
530
  self.merge_near_duplicates = merge_near_duplicates
530
531
  self.flags = flags
531
532
 
533
+ _COLUMN_ALIASES = {"dataset_index": "dataset_indices"}
534
+
535
+ def __getitem__(self, item: Any) -> Any:
536
+ if isinstance(item, str) and item in self._COLUMN_ALIASES:
537
+ new_name = self._COLUMN_ALIASES[item]
538
+ warnings.warn(
539
+ f"Column '{item}' was renamed to '{new_name}'. Access via '{item}' will be removed in v1.1.",
540
+ DeprecationWarning,
541
+ stacklevel=2,
542
+ )
543
+ item = new_name
544
+ return self.data()[item]
545
+
532
546
  def __len__(self) -> int:
533
547
  """Return the number of duplicate groups."""
534
548
  return self.data().shape[0]
@@ -569,7 +583,7 @@ class DuplicatesOutput(DataFrameOutput, Generic[TExactDuplicatesGroup, TNearDupl
569
583
  - Single-dataset with targets: ``list[tuple[list[SourceIndex], list[str]]]``
570
584
  - Cross-dataset: wraps the above in a ``dict`` keyed by dataset index.
571
585
  """
572
- is_cross = "dataset_index" in self.data().columns
586
+ is_cross = "dataset_indices" in self.data().columns
573
587
  has_targets = "target_indices" in self.data().columns
574
588
  is_near = dup_type == "near"
575
589
 
@@ -654,7 +668,7 @@ class DuplicatesOutput(DataFrameOutput, Generic[TExactDuplicatesGroup, TNearDupl
654
668
  - dup_types: list[str] - Unique duplicate types for this image
655
669
  - methods: list[str] - All unique methods that detected this image
656
670
  """
657
- if "dataset_index" in self.data().columns:
671
+ if "dataset_indices" in self.data().columns:
658
672
  raise ValueError("aggregate_by_image only works with output from a single dataset.")
659
673
 
660
674
  schema: Any = {
@@ -1074,7 +1088,7 @@ class Duplicates(Evaluator):
1074
1088
  -------
1075
1089
  DuplicatesOutput
1076
1090
  Duplicate detection results as a DataFrame of duplicate groups.
1077
- For cross-dataset detection, includes a dataset_index column.
1091
+ For cross-dataset detection, includes a dataset_indices column.
1078
1092
 
1079
1093
  See Also
1080
1094
  --------
@@ -1232,7 +1246,7 @@ class Duplicates(Evaluator):
1232
1246
  -------
1233
1247
  SingleDuplicatesOutput or MultiDuplicatesOutput
1234
1248
  Duplicate detection results as a DataFrame of duplicate groups.
1235
- For multi-dataset input, includes a ``dataset_index`` column.
1249
+ For multi-dataset input, includes a ``dataset_indices`` column.
1236
1250
 
1237
1251
  Raises
1238
1252
  ------
@@ -1244,24 +1258,32 @@ class Duplicates(Evaluator):
1244
1258
  Hash-based duplicates with merged near duplicates (default):
1245
1259
 
1246
1260
  >>> detector = Duplicates()
1247
- >>> result = detector.evaluate(images)
1248
- >>> result
1249
- shape: (4, 5)
1250
- ┌──────────┬───────┬──────────┬───────────────┬────────────────────┐
1251
- │ group_id ┆ level ┆ dup_type ┆ item_indices ┆ methods │
1252
- │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
1253
- │ i64 ┆ str ┆ str ┆ list[i64] ┆ list[str] │
1254
- ╞══════════╪═══════╪══════════╪═══════════════╪════════════════════╡
1255
- │ 0 ┆ item ┆ exact ┆ [3, 20] ┆ ["xxhash"] │
1256
- │ 1 ┆ item ┆ exact ┆ [7, 11, … 25] ┆ ["xxhash"] │
1257
- │ 2 ┆ item ┆ exact ┆ [16, 37] ┆ ["xxhash"] │
1258
- │ 3 ┆ item ┆ near ┆ [0, 1, … 49] ┆ ["dhash", "phash"] │
1259
- └──────────┴───────┴──────────┴───────────────┴────────────────────┘
1261
+ >>> detector.evaluate(images)
1262
+ shape: (3, 5)
1263
+ ┌──────────┬───────┬──────────┬───────────────┬────────────┐
1264
+ │ group_id ┆ level ┆ dup_type ┆ item_indices ┆ methods │
1265
+ │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
1266
+ │ i64 ┆ str ┆ str ┆ list[i64] ┆ list[str] │
1267
+ ╞══════════╪═══════╪══════════╪═══════════════╪════════════╡
1268
+ │ 0 ┆ item ┆ exact ┆ [3, 20] ┆ ["xxhash"] │
1269
+ │ 1 ┆ item ┆ exact ┆ [7, 11, … 25] ┆ ["xxhash"] │
1270
+ │ 2 ┆ item ┆ exact ┆ [16, 37] ┆ ["xxhash"] │
1271
+ └──────────┴───────┴──────────┴───────────────┴────────────┘
1260
1272
 
1261
1273
  Cross-dataset detection:
1262
1274
 
1263
1275
  >>> detector = Duplicates()
1264
- >>> result = detector.evaluate(train_ds, test_ds)
1276
+ >>> detector.evaluate(train_ds, test_ds)
1277
+ shape: (3, 6)
1278
+ ┌──────────┬───────┬──────────┬───────────────┬─────────────────┬────────────┐
1279
+ │ group_id ┆ level ┆ dup_type ┆ item_indices ┆ dataset_indices ┆ methods │
1280
+ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
1281
+ │ i64 ┆ str ┆ str ┆ list[i64] ┆ list[i64] ┆ list[str] │
1282
+ ╞══════════╪═══════╪══════════╪═══════════════╪═════════════════╪════════════╡
1283
+ │ 0 ┆ item ┆ exact ┆ [3, 20] ┆ [0, 0] ┆ ["xxhash"] │
1284
+ │ 1 ┆ item ┆ exact ┆ [7, 11, … 25] ┆ [0, 0, … 0] ┆ ["xxhash"] │
1285
+ │ 2 ┆ item ┆ exact ┆ [16, 37] ┆ [0, 0] ┆ ["xxhash"] │
1286
+ └──────────┴───────┴──────────┴───────────────┴─────────────────┴────────────┘
1265
1287
  """
1266
1288
  if other:
1267
1289
  return self._evaluate_multi([data, *other], per_image=per_image, per_target=per_target)
@@ -1296,7 +1318,11 @@ class Duplicates(Evaluator):
1296
1318
  # Hash-based duplicate detection
1297
1319
  if self.flags & ImageStats.HASH:
1298
1320
  self.stats = compute_stats(
1299
- data, stats=self.flags & ImageStats.HASH, per_image=per_image, per_target=per_target
1321
+ data,
1322
+ stats=self.flags & ImageStats.HASH,
1323
+ per_image=per_image,
1324
+ per_target=per_target,
1325
+ normalize_pixel_values=False,
1300
1326
  )
1301
1327
  (item_exact, item_near), (target_exact, target_near) = _detect_hash_duplicates(
1302
1328
  self.stats["stats"], self.stats["source_index"]
@@ -1358,7 +1384,13 @@ class Duplicates(Evaluator):
1358
1384
  calc_results: list[StatsResult] = []
1359
1385
  if has_hash_detection:
1360
1386
  calc_results = [
1361
- compute_stats(ds, stats=self.flags & ImageStats.HASH, per_image=per_image, per_target=per_target)
1387
+ compute_stats(
1388
+ ds,
1389
+ stats=self.flags & ImageStats.HASH,
1390
+ per_image=per_image,
1391
+ per_target=per_target,
1392
+ normalize_pixel_values=False,
1393
+ )
1362
1394
  for ds in datasets
1363
1395
  ]
1364
1396
  self.stats = calc_results[-1]
@@ -1623,7 +1623,9 @@ class Outliers(Evaluator):
1623
1623
  stored_cluster_stats: ClusterStats | None = None
1624
1624
 
1625
1625
  if self.flags != ImageStats.NONE:
1626
- self.stats = compute_stats(data, stats=self.flags, per_image=per_image, per_target=per_target)
1626
+ self.stats = compute_stats(
1627
+ data, stats=self.flags, per_image=per_image, per_target=per_target, normalize_pixel_values=True
1628
+ )
1627
1629
  stats_result = self.stats
1628
1630
 
1629
1631
  class_ids: NDArray[np.intp] | None = None
@@ -1674,7 +1676,10 @@ class Outliers(Evaluator):
1674
1676
  stats_results: list[StatsResult] = []
1675
1677
  if self.flags != ImageStats.NONE:
1676
1678
  stats_results = [
1677
- compute_stats(ds, stats=self.flags, per_image=per_image, per_target=per_target) for ds in datasets
1679
+ compute_stats(
1680
+ ds, stats=self.flags, per_image=per_image, per_target=per_target, normalize_pixel_values=True
1681
+ )
1682
+ for ds in datasets
1678
1683
  ]
1679
1684
  self.stats = stats_results[-1]
1680
1685
 
@@ -413,7 +413,7 @@ def rescale(image: NDArray[Any], depth: int = 1) -> NDArray[Any]:
413
413
  bitdepth = get_bitdepth(image)
414
414
  if bitdepth.depth == depth:
415
415
  return image
416
- normalized = (image + bitdepth.pmin) / (bitdepth.pmax - bitdepth.pmin)
416
+ normalized = (image - bitdepth.pmin) / (bitdepth.pmax - bitdepth.pmin)
417
417
  return normalized * (2**depth - 1)
418
418
 
419
419
 
@@ -554,6 +554,12 @@ def to_canonical_grayscale(image: NDArray[Any]) -> NDArray[np.uint8]:
554
554
  NDArray[np.uint8]
555
555
  2D grayscale array (HW) of type np.uint8
556
556
  """
557
+ # Rescale normalized [0, 1] float images to [0, 255] range
558
+ if np.issubdtype(image.dtype, np.floating) and image.size > 0:
559
+ pmin, pmax = np.nanmin(image), np.nanmax(image)
560
+ if pmax <= 1.0 and pmin >= 0.0:
561
+ image = image * 255.0
562
+
557
563
  channels = image.shape[0]
558
564
 
559
565
  # --- Case 1: Single Channel (Already Grayscale) ---
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes