dataeval 0.86.5__py3-none-any.whl → 0.86.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dataeval/__init__.py CHANGED
@@ -8,7 +8,7 @@ shifts that impact performance of deployed models.
8
8
  from __future__ import annotations
9
9
 
10
10
  __all__ = ["config", "detectors", "log", "metrics", "typing", "utils", "workflows"]
11
- __version__ = "0.86.5"
11
+ __version__ = "0.86.7"
12
12
 
13
13
  import logging
14
14
 
@@ -234,14 +234,17 @@ class Metadata:
234
234
  if is_od_target := isinstance(target, ObjectDetectionTarget):
235
235
  target_labels = as_numpy(target.labels)
236
236
  target_len = len(target_labels)
237
- labels.extend(target_labels.tolist())
238
- bboxes.extend(as_numpy(target.boxes).tolist())
239
- scores.extend(as_numpy(target.scores).tolist())
240
- srcidx.extend([i] * target_len)
237
+ if target_len:
238
+ labels.extend(target_labels.tolist())
239
+ bboxes.extend(as_numpy(target.boxes).tolist())
240
+ scores.extend(as_numpy(target.scores).tolist())
241
+ srcidx.extend([i] * target_len)
241
242
  elif isinstance(target, Array):
242
- target_len = 1
243
- labels.append(int(np.argmax(as_numpy(target))))
244
- scores.append(target)
243
+ if len(target):
244
+ target_len = 1
245
+ labels.append(int(np.argmax(as_numpy(target))))
246
+ scores.append(target)
247
+ srcidx.append(i)
245
248
  else:
246
249
  raise TypeError("Encountered unsupported target type in dataset")
247
250
 
@@ -252,18 +255,18 @@ class Metadata:
252
255
  labels = as_numpy(labels).astype(np.intp)
253
256
  scores = as_numpy(scores).astype(np.float32)
254
257
  bboxes = as_numpy(bboxes).astype(np.float32) if is_od else None
255
- srcidx = as_numpy(srcidx).astype(np.intp) if is_od else None
258
+ srcidx = as_numpy(srcidx).astype(np.intp)
256
259
 
257
260
  index2label = self._dataset.metadata.get("index2label", {i: str(i) for i in np.unique(labels)})
258
261
 
259
- targets_per_image = None if srcidx is None else np.unique(srcidx, return_counts=True)[1].tolist()
262
+ targets_per_image = np.bincount(srcidx, minlength=len(self._dataset)).tolist() if is_od else None
260
263
  merged = merge(raw, return_dropped=True, ignore_lists=False, targets_per_image=targets_per_image)
261
264
 
262
265
  reserved = ["image_index", "class_label", "score", "box"]
263
266
  factor_dict = {f"metadata_{k}" if k in reserved else k: v for k, v in merged[0].items() if k != "_image_index"}
264
267
 
265
268
  target_dict = {
266
- "image_index": srcidx if srcidx is not None else np.arange(len(labels)),
269
+ "image_index": srcidx,
267
270
  "class_label": labels,
268
271
  "score": scores,
269
272
  "box": bboxes if bboxes is not None else [None] * len(labels),
@@ -18,57 +18,59 @@ from dataeval.outputs._stats import BASE_ATTRS
18
18
  from dataeval.typing import ArrayLike, Dataset
19
19
 
20
20
 
21
+ def _get_zscore_mask(values: NDArray[np.float64], threshold: float | None) -> NDArray[np.bool_] | None:
22
+ threshold = threshold if threshold is not None else 3.0
23
+ std_val = np.nanstd(values)
24
+ if std_val > EPSILON:
25
+ mean_val = np.nanmean(values)
26
+ abs_diff = np.abs(values - mean_val)
27
+ return (abs_diff / std_val) > threshold
28
+ return None
29
+
30
+
31
+ def _get_modzscore_mask(values: NDArray[np.float64], threshold: float | None) -> NDArray[np.bool_] | None:
32
+ threshold = threshold if threshold is not None else 3.5
33
+ median_val = np.nanmedian(values)
34
+ abs_diff = np.abs(values - median_val)
35
+ m_abs_diff = np.nanmedian(abs_diff)
36
+ m_abs_diff = np.nanmean(abs_diff) if m_abs_diff <= EPSILON else m_abs_diff
37
+ if m_abs_diff > EPSILON:
38
+ mod_z_score = 0.6745 * abs_diff / m_abs_diff
39
+ return mod_z_score > threshold
40
+ return None
41
+
42
+
43
+ def _get_iqr_mask(values: NDArray[np.float64], threshold: float | None) -> NDArray[np.bool_] | None:
44
+ threshold = threshold if threshold is not None else 1.5
45
+ qrt = np.nanpercentile(values, q=(25, 75), method="midpoint")
46
+ iqr_val = qrt[1] - qrt[0]
47
+ if iqr_val > EPSILON:
48
+ iqr_threshold = iqr_val * threshold
49
+ return (values < (qrt[0] - iqr_threshold)) | (values > (qrt[1] + iqr_threshold))
50
+ return None
51
+
52
+
21
53
  def _get_outlier_mask(
22
54
  values: NDArray[Any], method: Literal["zscore", "modzscore", "iqr"], threshold: float | None
23
55
  ) -> NDArray[np.bool_]:
24
56
  if len(values) == 0:
25
57
  return np.array([], dtype=bool)
26
58
 
27
- values = values.astype(np.float64)
28
-
29
- valid_mask = ~np.isnan(values)
30
- outliers = np.full(values.shape, False, dtype=bool)
31
-
32
- if not np.any(valid_mask):
33
- return outliers
34
-
35
- if method == "zscore":
36
- threshold = threshold if threshold is not None else 3.0
37
-
38
- std_val = np.nanstd(values)
39
-
40
- if std_val > EPSILON:
41
- mean_val = np.nanmean(values)
42
- abs_diff = np.abs(values - mean_val)
43
- outliers = (abs_diff / std_val) > threshold
59
+ nan_mask = np.isnan(values)
44
60
 
61
+ if np.all(nan_mask):
62
+ outliers = None
63
+ elif method == "zscore":
64
+ outliers = _get_zscore_mask(values.astype(np.float64), threshold)
45
65
  elif method == "modzscore":
46
- threshold = threshold if threshold is not None else 3.5
47
-
48
- median_val = np.nanmedian(values)
49
- abs_diff = np.abs(values - median_val)
50
- m_abs_diff = np.nanmedian(abs_diff)
51
- m_abs_diff = np.nanmean(abs_diff) if m_abs_diff <= EPSILON else m_abs_diff
52
-
53
- if m_abs_diff > EPSILON:
54
- mod_z_score = 0.6745 * abs_diff / m_abs_diff
55
- outliers = mod_z_score > threshold
56
-
66
+ outliers = _get_modzscore_mask(values.astype(np.float64), threshold)
57
67
  elif method == "iqr":
58
- threshold = threshold if threshold is not None else 1.5
59
-
60
- qrt = np.nanpercentile(values, q=(25, 75), method="midpoint")
61
- iqr_val = qrt[1] - qrt[0]
62
-
63
- if iqr_val > EPSILON:
64
- iqr_threshold = iqr_val * threshold
65
- outliers = (values < (qrt[0] - iqr_threshold)) | (values > (qrt[1] + iqr_threshold))
66
-
68
+ outliers = _get_iqr_mask(values.astype(np.float64), threshold)
67
69
  else:
68
70
  raise ValueError("Outlier method must be 'zscore' 'modzscore' or 'iqr'.")
69
71
 
70
- outliers[~valid_mask] = False
71
- return outliers
72
+ # If outliers were found, return the mask with NaN values set to False, otherwise return all False
73
+ return outliers & ~nan_mask if outliers is not None else np.full(values.shape, False, dtype=bool)
72
74
 
73
75
 
74
76
  class Outliers:
@@ -6,7 +6,6 @@ from typing import Any, Callable
6
6
 
7
7
  import numpy as np
8
8
 
9
- from dataeval.config import EPSILON
10
9
  from dataeval.metrics.stats._base import StatsProcessor, run_stats
11
10
  from dataeval.outputs import DimensionStatsOutput
12
11
  from dataeval.outputs._base import set_metadata
@@ -23,8 +22,8 @@ class DimensionStatsProcessor(StatsProcessor[DimensionStatsOutput]):
23
22
  "height": lambda x: x.box.height,
24
23
  "channels": lambda x: x.shape[-3],
25
24
  "size": lambda x: x.box.width * x.box.height,
26
- "aspect_ratio": lambda x: x.box.width / (x.box.height + EPSILON),
27
- "depth": lambda x: get_bitdepth(x.image).depth,
25
+ "aspect_ratio": lambda x: 0.0 if x.box.height == 0 else x.box.width / x.box.height,
26
+ "depth": lambda x: get_bitdepth(x.raw).depth,
28
27
  "center": lambda x: np.asarray([(x.box.x0 + x.box.x1) / 2, (x.box.y0 + x.box.y1) / 2]),
29
28
  "distance_center": lambda x: np.sqrt(
30
29
  np.square(((x.box.x0 + x.box.x1) / 2) - (x.raw.shape[-1] / 2))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dataeval
3
- Version: 0.86.5
3
+ Version: 0.86.7
4
4
  Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
5
5
  Home-page: https://dataeval.ai/
6
6
  License: MIT
@@ -1,10 +1,10 @@
1
- dataeval/__init__.py,sha256=5qOVdEDEZt5O--VufuRJXGEByzQC7pJWZluFGzPuNOc,1636
1
+ dataeval/__init__.py,sha256=P6WvVjHlE2nH57bXBR4A9ez6R32OQGm9bshYrxRKwFw,1636
2
2
  dataeval/_log.py,sha256=C7AGkIRzymvYJ0LQXtnShiy3i5Xrp8T58JzIHHguk_Q,365
3
3
  dataeval/config.py,sha256=bHa8np4FCtLLv8_xlfdDC4lb1InJ_kT0vXDO5P42rvk,4082
4
4
  dataeval/data/__init__.py,sha256=wzQ6uUFLNB3VJR0a2QnRBYwEmwXT93q0WpHu7FmFW1E,486
5
5
  dataeval/data/_embeddings.py,sha256=PFjpdV9bfusCB4taTIYSzx1hP8nJb_KCkZTN8kMw-Hs,12885
6
6
  dataeval/data/_images.py,sha256=Rc_59CuU4zfN7Xm7an1XUx8ZghQg6a56VJWMZD9edRw,2654
7
- dataeval/data/_metadata.py,sha256=OTda9V7DA5Ejxip_NR16LCK2C8HMtpjWHHiFoW3LrLY,14364
7
+ dataeval/data/_metadata.py,sha256=3aixstlgcAZXC0qNjwDlxjscC3IX1xjPt_FK0liRqoo,14423
8
8
  dataeval/data/_selection.py,sha256=r06xeiyK8nTWPLyItkoPQRWZI1i6LATSue_cuEbCdc4,4463
9
9
  dataeval/data/_split.py,sha256=nQABR05vxil2Qx7-uX4Fm0_DWpibskBGDJOYj_b1u3I,16737
10
10
  dataeval/data/selections/__init__.py,sha256=2m8ZB53wXzqLcqmc6p5atO6graB6ZyiRSNJFxf11X_g,613
@@ -32,7 +32,7 @@ dataeval/detectors/drift/_uncertainty.py,sha256=BHlykJ-r7TGLJxdPfoazXnoAJ1qVDzbk
32
32
  dataeval/detectors/drift/updates.py,sha256=L1PnrPlIE1x6ujCc5mCwjcAZwadVTn-Zjb6MnTDvzJQ,2251
33
33
  dataeval/detectors/linters/__init__.py,sha256=xn2zPwUcmsuf-Jd9uw6AVI11C9z1b1Y9fYtuFnXenZ0,404
34
34
  dataeval/detectors/linters/duplicates.py,sha256=X5WSEvI_BHkLoXjkaHK6wTnSkx4IjpO_exMRjSlhc70,4963
35
- dataeval/detectors/linters/outliers.py,sha256=WO686jVbGbtDjO-8CuYVLxpeUGv8MpIK9QjADlTdd40,9596
35
+ dataeval/detectors/linters/outliers.py,sha256=GaM9n8yPgBPzVOL_bxJCj0eCwobEEP4JHKHD9liRdlw,10130
36
36
  dataeval/detectors/ood/__init__.py,sha256=juCYBDs7CQEAtMhnEpPqF6uTrOIH9kTBSuQ_GRw6a8o,283
37
37
  dataeval/detectors/ood/ae.py,sha256=fTrUfFxv6xUqzKpwMC8rW3JrizA16M_bgzqLuBKMrS0,2944
38
38
  dataeval/detectors/ood/base.py,sha256=9b-Ljznf0lB1SXF4F_Aj3eJ4Y3ijGEDPMjucUsWOGJM,3051
@@ -56,7 +56,7 @@ dataeval/metrics/estimators/_uap.py,sha256=BULEBbJ9BQ1IcTeZf0x7iI60QHAWCccBOM97F
56
56
  dataeval/metrics/stats/__init__.py,sha256=6tA_9nbbM5ObJ6cds8Y1VBtTQiTOxrpGQSFLu_lWGGA,1098
57
57
  dataeval/metrics/stats/_base.py,sha256=R-hxoEPLreZcxYxBfyjbKfdoGMMTPiqJ5g2zSO-1UYM,12541
58
58
  dataeval/metrics/stats/_boxratiostats.py,sha256=ROZrlqgbowkGfCR5PJ5TL7Og40iMOdUqJnsCtaz_Xek,6450
59
- dataeval/metrics/stats/_dimensionstats.py,sha256=EVO-BlxrZl8qrP09lwPbyWdrG1ZeDtgj4LiswDwEZ1I,2896
59
+ dataeval/metrics/stats/_dimensionstats.py,sha256=s2Juca8GG501nZd2SWL_YtXWkTfxUrUIAl53PO3_VeA,2876
60
60
  dataeval/metrics/stats/_hashstats.py,sha256=qa1CYRgOebkxqkALfffaPM-kJ074ZbyfpWbfOfuObSs,4758
61
61
  dataeval/metrics/stats/_imagestats.py,sha256=gUPNgN5Zwzdr7WnSwbve1NXNsyxd5dy3cSnlR_7guCg,3007
62
62
  dataeval/metrics/stats/_labelstats.py,sha256=_dXt3p8_-SHEtHvJWbL0rnQvO2g30zxX42mG2LGJepU,3195
@@ -107,7 +107,7 @@ dataeval/utils/torch/models.py,sha256=1idpXyjrYcCBSsbxxRUOto8xr4MJNjDEqQHiIXVU5Z
107
107
  dataeval/utils/torch/trainer.py,sha256=Oc2lK13uPGhmLYbmAqlPWyKxgG4YJFlnSXCqFHUZbdA,5528
108
108
  dataeval/workflows/__init__.py,sha256=ou8y0KO-d6W5lgmcyLjKlf-J_ckP3vilW7wHkgiDlZ4,255
109
109
  dataeval/workflows/sufficiency.py,sha256=j-R8dg4XE6a66p_oTXG2GNzgg3vGk85CTblxhFXaxog,8513
110
- dataeval-0.86.5.dist-info/LICENSE.txt,sha256=uAooygKWvX6NbU9Ran9oG2msttoG8aeTeHSTe5JeCnY,1061
111
- dataeval-0.86.5.dist-info/METADATA,sha256=qx7aNDgzyAfRRKWjDXkfXojBdsBFnjMgwTVl0JsLbbw,5353
112
- dataeval-0.86.5.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
113
- dataeval-0.86.5.dist-info/RECORD,,
110
+ dataeval-0.86.7.dist-info/LICENSE.txt,sha256=uAooygKWvX6NbU9Ran9oG2msttoG8aeTeHSTe5JeCnY,1061
111
+ dataeval-0.86.7.dist-info/METADATA,sha256=7FTgPB4Yj2zF7z2B6IIRe9WFc9VCBqrcFEIf5ByVHdw,5353
112
+ dataeval-0.86.7.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
113
+ dataeval-0.86.7.dist-info/RECORD,,