dataeval 0.76.1__py3-none-any.whl → 0.82.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (113)
  1. dataeval/__init__.py +3 -3
  2. dataeval/config.py +77 -0
  3. dataeval/detectors/__init__.py +1 -1
  4. dataeval/detectors/drift/__init__.py +6 -6
  5. dataeval/detectors/drift/{base.py → _base.py} +40 -85
  6. dataeval/detectors/drift/{cvm.py → _cvm.py} +21 -28
  7. dataeval/detectors/drift/{ks.py → _ks.py} +20 -26
  8. dataeval/detectors/drift/{mmd.py → _mmd.py} +31 -43
  9. dataeval/detectors/drift/{torch.py → _torch.py} +2 -1
  10. dataeval/detectors/drift/{uncertainty.py → _uncertainty.py} +24 -7
  11. dataeval/detectors/drift/updates.py +20 -3
  12. dataeval/detectors/linters/__init__.py +3 -5
  13. dataeval/detectors/linters/duplicates.py +13 -36
  14. dataeval/detectors/linters/outliers.py +23 -148
  15. dataeval/detectors/ood/__init__.py +1 -1
  16. dataeval/detectors/ood/ae.py +30 -9
  17. dataeval/detectors/ood/base.py +5 -4
  18. dataeval/detectors/ood/mixin.py +21 -7
  19. dataeval/detectors/ood/vae.py +73 -0
  20. dataeval/metadata/__init__.py +6 -0
  21. dataeval/metadata/_distance.py +167 -0
  22. dataeval/metadata/_ood.py +217 -0
  23. dataeval/metadata/_utils.py +44 -0
  24. dataeval/metrics/__init__.py +1 -1
  25. dataeval/metrics/bias/__init__.py +6 -4
  26. dataeval/metrics/bias/{balance.py → _balance.py} +15 -101
  27. dataeval/metrics/bias/_coverage.py +98 -0
  28. dataeval/metrics/bias/{diversity.py → _diversity.py} +18 -111
  29. dataeval/metrics/bias/{parity.py → _parity.py} +39 -77
  30. dataeval/metrics/estimators/__init__.py +15 -4
  31. dataeval/metrics/estimators/{ber.py → _ber.py} +42 -29
  32. dataeval/metrics/estimators/_clusterer.py +44 -0
  33. dataeval/metrics/estimators/{divergence.py → _divergence.py} +18 -30
  34. dataeval/metrics/estimators/{uap.py → _uap.py} +4 -18
  35. dataeval/metrics/stats/__init__.py +16 -13
  36. dataeval/metrics/stats/{base.py → _base.py} +82 -133
  37. dataeval/metrics/stats/{boxratiostats.py → _boxratiostats.py} +15 -18
  38. dataeval/metrics/stats/_dimensionstats.py +75 -0
  39. dataeval/metrics/stats/{hashstats.py → _hashstats.py} +21 -37
  40. dataeval/metrics/stats/_imagestats.py +94 -0
  41. dataeval/metrics/stats/_labelstats.py +131 -0
  42. dataeval/metrics/stats/{pixelstats.py → _pixelstats.py} +19 -50
  43. dataeval/metrics/stats/{visualstats.py → _visualstats.py} +23 -54
  44. dataeval/outputs/__init__.py +53 -0
  45. dataeval/{output.py → outputs/_base.py} +55 -25
  46. dataeval/outputs/_bias.py +381 -0
  47. dataeval/outputs/_drift.py +83 -0
  48. dataeval/outputs/_estimators.py +114 -0
  49. dataeval/outputs/_linters.py +184 -0
  50. dataeval/{detectors/ood/output.py → outputs/_ood.py} +22 -22
  51. dataeval/outputs/_stats.py +387 -0
  52. dataeval/outputs/_utils.py +44 -0
  53. dataeval/outputs/_workflows.py +364 -0
  54. dataeval/typing.py +234 -0
  55. dataeval/utils/__init__.py +2 -2
  56. dataeval/utils/_array.py +169 -0
  57. dataeval/utils/_bin.py +199 -0
  58. dataeval/utils/_clusterer.py +144 -0
  59. dataeval/utils/_fast_mst.py +189 -0
  60. dataeval/utils/{image.py → _image.py} +6 -4
  61. dataeval/utils/_method.py +14 -0
  62. dataeval/utils/{shared.py → _mst.py} +3 -65
  63. dataeval/utils/{plot.py → _plot.py} +6 -6
  64. dataeval/utils/data/__init__.py +26 -0
  65. dataeval/utils/data/_dataset.py +217 -0
  66. dataeval/utils/data/_embeddings.py +104 -0
  67. dataeval/utils/data/_images.py +68 -0
  68. dataeval/utils/data/_metadata.py +360 -0
  69. dataeval/utils/data/_selection.py +126 -0
  70. dataeval/utils/{dataset/split.py → data/_split.py} +12 -38
  71. dataeval/utils/data/_targets.py +85 -0
  72. dataeval/utils/data/collate.py +103 -0
  73. dataeval/utils/data/datasets/__init__.py +17 -0
  74. dataeval/utils/data/datasets/_base.py +254 -0
  75. dataeval/utils/data/datasets/_cifar10.py +134 -0
  76. dataeval/utils/data/datasets/_fileio.py +168 -0
  77. dataeval/utils/data/datasets/_milco.py +153 -0
  78. dataeval/utils/data/datasets/_mixin.py +56 -0
  79. dataeval/utils/data/datasets/_mnist.py +183 -0
  80. dataeval/utils/data/datasets/_ships.py +123 -0
  81. dataeval/utils/data/datasets/_types.py +52 -0
  82. dataeval/utils/data/datasets/_voc.py +352 -0
  83. dataeval/utils/data/selections/__init__.py +15 -0
  84. dataeval/utils/data/selections/_classfilter.py +57 -0
  85. dataeval/utils/data/selections/_indices.py +26 -0
  86. dataeval/utils/data/selections/_limit.py +26 -0
  87. dataeval/utils/data/selections/_reverse.py +18 -0
  88. dataeval/utils/data/selections/_shuffle.py +29 -0
  89. dataeval/utils/metadata.py +51 -376
  90. dataeval/utils/torch/{gmm.py → _gmm.py} +4 -2
  91. dataeval/utils/torch/{internal.py → _internal.py} +21 -51
  92. dataeval/utils/torch/models.py +43 -2
  93. dataeval/workflows/__init__.py +2 -1
  94. dataeval/workflows/sufficiency.py +11 -346
  95. {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/METADATA +5 -2
  96. dataeval-0.82.0.dist-info/RECORD +104 -0
  97. dataeval/detectors/linters/clusterer.py +0 -512
  98. dataeval/detectors/linters/merged_stats.py +0 -49
  99. dataeval/detectors/ood/metadata_ks_compare.py +0 -129
  100. dataeval/detectors/ood/metadata_least_likely.py +0 -119
  101. dataeval/interop.py +0 -69
  102. dataeval/metrics/bias/coverage.py +0 -194
  103. dataeval/metrics/stats/datasetstats.py +0 -202
  104. dataeval/metrics/stats/dimensionstats.py +0 -115
  105. dataeval/metrics/stats/labelstats.py +0 -210
  106. dataeval/utils/dataset/__init__.py +0 -7
  107. dataeval/utils/dataset/datasets.py +0 -412
  108. dataeval/utils/dataset/read.py +0 -63
  109. dataeval-0.76.1.dist-info/RECORD +0 -67
  110. /dataeval/{log.py → _log.py} +0 -0
  111. /dataeval/utils/torch/{blocks.py → _blocks.py} +0 -0
  112. {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/LICENSE.txt +0 -0
  113. {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/WHEEL +0 -0
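For readers who want to reproduce a file-level summary like the one above, here is a minimal sketch using only the Python standard library. The function name `wheel_file_changes` is hypothetical; it classifies paths as added, removed, or changed by comparing CRCs, and does not attempt the rename detection shown in the list above.

```python
from zipfile import ZipFile


def wheel_file_changes(old_wheel: str, new_wheel: str) -> dict[str, str]:
    """Classify each path across two wheels as added, removed, or changed."""
    with ZipFile(old_wheel) as old, ZipFile(new_wheel) as new:
        # A wheel is a zip archive; each member carries a CRC of its contents.
        old_crcs = {i.filename: i.CRC for i in old.infolist()}
        new_crcs = {i.filename: i.CRC for i in new.infolist()}
    changes = {}
    for path in sorted(old_crcs.keys() | new_crcs.keys()):
        if path not in old_crcs:
            changes[path] = "added"
        elif path not in new_crcs:
            changes[path] = "removed"
        elif old_crcs[path] != new_crcs[path]:
            changes[path] = "changed"
    return changes
```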
dataeval-0.82.0.dist-info/RECORD
@@ -0,0 +1,104 @@
+ dataeval/__init__.py,sha256=aaXb18noAWzNZsE9bIlMnglK-d9BMJCm1wYsQyzX6sc,1510
+ dataeval/_log.py,sha256=Mn5bRWO0cgtAYd5VGYSFiPgu57ta3zoktrtHAZ1m3dU,357
+ dataeval/config.py,sha256=x55jqLFrlHvOcNqPXudVnF24yc3OAaEAu-q9NJZSIq4,2225
+ dataeval/detectors/__init__.py,sha256=3Sg-XWlwr75zEEH3hZKA4nWMtGvaRlnfzTWvZG_Ak6U,189
+ dataeval/detectors/drift/__init__.py,sha256=6is_XBtG1d-vUbhHvqXGOdnAwxJ7NA5yRfURn7pCeIw,651
+ dataeval/detectors/drift/_base.py,sha256=mJdKvyROgWvz-p1VlAIJqUI6BAj9ss8riUvR5An5wIw,13459
+ dataeval/detectors/drift/_cvm.py,sha256=H2w-I0eMD7yP-CSmpdodeJ0-TYznJT7w_H7JuobESow,3859
+ dataeval/detectors/drift/_ks.py,sha256=-5k3RBPA3kadX7oD14Wc52rAqQf1udwFeW7Qf3Sv4Tw,4058
+ dataeval/detectors/drift/_mmd.py,sha256=_z1ateuWy8TMtP20oTIOSwBeqkXTmo3C2_Q5_7QKnBs,7258
+ dataeval/detectors/drift/_torch.py,sha256=BY-AEqjkzX8fJnLJSBosHnsRsUorL0de_ysJjkZyS0s,7687
+ dataeval/detectors/drift/_uncertainty.py,sha256=WJBlMAPBKD1qRCc0lxkKIqux4tPdOT4p-rwhD6Vuu2Q,5703
+ dataeval/detectors/drift/updates.py,sha256=Btu2iaZW7fbO59G1w5v3ykFot0YPzy2U6VjF0d440VE,2195
+ dataeval/detectors/linters/__init__.py,sha256=xn2zPwUcmsuf-Jd9uw6AVI11C9z1b1Y9fYtuFnXenZ0,404
+ dataeval/detectors/linters/duplicates.py,sha256=om2d_wR3vtzI6CG_Apu74T9FMXllss99H0ELz_JFADQ,4935
+ dataeval/detectors/linters/outliers.py,sha256=1eiVrM_A-glZWw2-ISy0JYkM_Ki9JIuRnTVa-eXwQi0,9042
+ dataeval/detectors/ood/__init__.py,sha256=juCYBDs7CQEAtMhnEpPqF6uTrOIH9kTBSuQ_GRw6a8o,283
+ dataeval/detectors/ood/ae.py,sha256=hB2iV8YhYceJUMPyop5048eB78hUrAkRCnyOyJY5-8o,2949
+ dataeval/detectors/ood/base.py,sha256=I2gW8cRWR-eBSI2zwESDrnYUEsMlhRsnWJWVyw4Jgkg,3047
+ dataeval/detectors/ood/metadata_ood_mi.py,sha256=7_Sdzf7-x1TlrIQvSyOIB98C8_UQhUwmwFQmZ9_q1Uc,4042
+ dataeval/detectors/ood/mixin.py,sha256=jc_mrtCRmeV51veiyD48sBxNc70-_MBT6ugNIB7D2W8,5431
+ dataeval/detectors/ood/vae.py,sha256=dEEf0TSnLl6xs80LEq0CEUlFvXIUtbOwoUNlnF8ig6g,2260
+ dataeval/metadata/__init__.py,sha256=B5Ix4T75UPEqY0rofaJlbRf8zCqx8yWLdE3Jo9cALHc,262
+ dataeval/metadata/_distance.py,sha256=zcuGFY4Zymp5U1S0OR9p1JT5zjqO0sAkmWfY5lxb9VY,4898
+ dataeval/metadata/_ood.py,sha256=k-7v8ZHdTC2TUCr07B1MtKIkGIHpPFJRKcH0Rey4pfY,8010
+ dataeval/metadata/_utils.py,sha256=r8qBJT83RblobD5W5zyTVi6vYi51Dwkqswizdbzss-M,1169
+ dataeval/metrics/__init__.py,sha256=8VC8q3HuJN3o_WN51Ae2_wXznl3RMXIvA5GYVcy7vr8,225
+ dataeval/metrics/bias/__init__.py,sha256=1yTLmgiu1kwT_7ZWcjOUbj8R0NJ0DjGoCuWdA0_T7kc,683
+ dataeval/metrics/bias/_balance.py,sha256=x0daiY0TaiuanwxIbOPm7_0ksepE25nGULLrOotWqMU,5927
+ dataeval/metrics/bias/_coverage.py,sha256=PeUoOiaghUEdn6Ov8z2-am7-fnBVIPcFbJK7Ty5JObA,3647
+ dataeval/metrics/bias/_diversity.py,sha256=JvLN6tGIcGpRfQt4INdEyPQwU8OOLSiosPnMnh6RDd0,5668
+ dataeval/metrics/bias/_parity.py,sha256=heQr_CUcdhHU9x7kT3FtF8w30IEKsok798dRW5jOUGA,11384
+ dataeval/metrics/estimators/__init__.py,sha256=Pnds8uIyAovt2fKqZjiHCIP_kVoBWlVllekYuK5UmmU,568
+ dataeval/metrics/estimators/_ber.py,sha256=TrZNO1frRldUDICLzaQGt9wuMiqmvsUFdkZ3cIVv9W4,5344
+ dataeval/metrics/estimators/_clusterer.py,sha256=1HrpihGTJ63IkNSOy4Ibw633Gllkm1RxKmoKT5MOgt0,1434
+ dataeval/metrics/estimators/_divergence.py,sha256=QDWl1lyAYoO9D3Ho7qOHSk6ud8Gi2MGuXEsYwO1HxvA,4043
+ dataeval/metrics/estimators/_uap.py,sha256=BULEBbJ9BQ1IcTeZf0x7iI60QHAWCccBOM97FIu9VXA,1928
+ dataeval/metrics/stats/__init__.py,sha256=6tA_9nbbM5ObJ6cds8Y1VBtTQiTOxrpGQSFLu_lWGGA,1098
+ dataeval/metrics/stats/_base.py,sha256=VlsoHatQBJ4XVNO8pMHUl-2NCC39fknWsQvs7dimMNA,10838
+ dataeval/metrics/stats/_boxratiostats.py,sha256=K0hkPuLYHHZJEJG8MOPEGcY7ASsRQLKpj7V7yy4-xAc,6341
+ dataeval/metrics/stats/_dimensionstats.py,sha256=73mFP-Myxne0peFliwvTntc0kk4cpq0krzMvSLDSIMM,2702
+ dataeval/metrics/stats/_hashstats.py,sha256=gp9X_pnTT3mPH9YNrWLdn2LQPK_epJ3dQRoyOCwmKlg,4758
+ dataeval/metrics/stats/_imagestats.py,sha256=Usxuc7_TJVNCm5SnwV6oYfoD333HQ6c4xjdth3N0b6Y,3000
+ dataeval/metrics/stats/_labelstats.py,sha256=PtGyqj4RHw0cyLAWAR9FzZGqgA81AtxLGHZiuMAL2h0,4100
+ dataeval/metrics/stats/_pixelstats.py,sha256=SfergRbjNJE4h0xqe-0c8RnKtZmEkZ9MwExdipLSGvg,3247
+ dataeval/metrics/stats/_visualstats.py,sha256=cq4AbF2B50Ihbzb86FphcnKQ1TSwNnP3PsnbpiPQZWw,3698
+ dataeval/outputs/__init__.py,sha256=sXWjCvB4-uFMUGFHGEhZxp7jid39xN5AOWbFMQ2SPeE,1419
+ dataeval/outputs/_base.py,sha256=PKu-jo6jxuuCHFFpY147wxSI7vp-6LlKVSTudYJ86GQ,4817
+ dataeval/outputs/_bias.py,sha256=aTsDeG48LAW_Z5kcKpJzcP4NpwQSNveZeG51sCHUcQo,12171
+ dataeval/outputs/_drift.py,sha256=gOiu2C-ERTWiRqlP0auMYxPBGdm9HecWPqWfg7I4tZg,2015
+ dataeval/outputs/_estimators.py,sha256=a2oAIxxEDZ9WLGfMWH8KD-BVUS_SnULRPR-iI9hFPoQ,3047
+ dataeval/outputs/_linters.py,sha256=kbzJne6ZFu795JVBK_p7jv_U72cqErhKXHsitzYBgPE,6396
+ dataeval/outputs/_ood.py,sha256=suLKVXULGtXH0rq9eXHI1d3d2jhGmItJtz4QiQd47A4,1718
+ dataeval/outputs/_stats.py,sha256=NrINChtF3D7Nq6VeXzhTk7ZszDWnluZ4HhLELXeK4xw,13010
+ dataeval/outputs/_utils.py,sha256=HHlGC7sk416m_3Bgn075Qdblz_aPup_UOafJpB0RuXY,893
+ dataeval/outputs/_workflows.py,sha256=MkRD6ubI4NCBXb9v3kjXy64cUGs3G-JKkBdOpRD9XVE,10750
+ dataeval/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ dataeval/typing.py,sha256=YNSCZ6V39JaPoXwOEexIOIejTjoCQTE9UA_DbHyW34o,5824
+ dataeval/utils/__init__.py,sha256=T8F8zJh4ZAeu0wDzfpld92I2zJg9mWBmkGCHrDPU7gk,264
+ dataeval/utils/_array.py,sha256=fc04sYShIdsRS4qtG1UCnlGGk-yVRxlOHTNAmW7NpDY,4990
+ dataeval/utils/_bin.py,sha256=nylthmsC3vzLHLhlUMACvZs--h7xvAh9Pt75InaQJW8,7322
+ dataeval/utils/_clusterer.py,sha256=fw5x-2QN0TIbiodDKHZxRgxKHINedpPcOklzce0Rbjg,5436
+ dataeval/utils/_fast_mst.py,sha256=4_7ykVihCL5jWtxcGnrecIsDQo65kUml9SZ1JxgBZYY,7172
+ dataeval/utils/_image.py,sha256=capzF_X5H0jy0PmTP3Hf52GFgLqrnfU6gS4tiwck9jo,1939
+ dataeval/utils/_method.py,sha256=9B9JQbgqWJBRhQJb7glajUtWaQzUTIUuvrZ9_bisxsM,394
+ dataeval/utils/_mst.py,sha256=gXjUUhz9G4wkcCUTqQ-61Ti9sZUFx08hEjlZXWiEmPc,2163
+ dataeval/utils/_plot.py,sha256=mTRQNbJsA42QMiOwZbJaH8sNYgP996QFDEGVVE9HSgY,7076
+ dataeval/utils/data/__init__.py,sha256=vldQ2ZXl8gnI3s4vAGqUUVi6dc_R58F3JMSpbCOyFRI,820
+ dataeval/utils/data/_dataset.py,sha256=tjZUJnxj9IY71GKqdKltrwufkn0EC0S3a6ylrW5Bc2s,7756
+ dataeval/utils/data/_embeddings.py,sha256=6yMzMT7tHRLaepuHVXom_ffvSwxatjppQZYJj1uKoe0,3565
+ dataeval/utils/data/_images.py,sha256=pv_vvpH8hWxPgLvjeVC2mZiyZivZFNLARNIOXam5ceY,1984
+ dataeval/utils/data/_metadata.py,sha256=Naxzf68V5_8oYYXRfO99-86LQQYwopr6Q_hfzCD5oZ4,13841
+ dataeval/utils/data/_selection.py,sha256=vpiYK6UqOXabqeXuhLnRuGJAOtw3ErqpxSc4bK7B7c0,4202
+ dataeval/utils/data/_split.py,sha256=ap_h52ncVev87VFbn2_WUHb0ZreUQQYokqz1SLF8-uI,18346
+ dataeval/utils/data/_targets.py,sha256=ws5d9wRiDkIuOV7GSAKNxzgSm6AWTgb0BFroQK5nAmM,3057
+ dataeval/utils/data/collate.py,sha256=Z5nmBnWV_IoJzMp_tj8RCKjMJA9sSCY_zZITqISGixc,3865
+ dataeval/utils/data/datasets/__init__.py,sha256=jBrswiERrvBx4pJQJZIq_B5UE-Wy8a2_SBfM2crG8R8,511
+ dataeval/utils/data/datasets/_base.py,sha256=lvC13xCy6DDlDBSu5PSdU73ySyeBde2Q91NxrMtos_s,8732
+ dataeval/utils/data/datasets/_cifar10.py,sha256=tDALqZKKcUbG6dHZm4MXQMhUCS1W_U-aC9aOu4fjxcM,5160
+ dataeval/utils/data/datasets/_fileio.py,sha256=SixIk5nIlIwJdX9zjNXS10vHA3hL8aaYbqHsDg1xSpY,6447
+ dataeval/utils/data/datasets/_milco.py,sha256=8FLMNn1qI92zGlsB3sAqDTATdK807wqQNCCiSL5lBOM,6068
+ dataeval/utils/data/datasets/_mixin.py,sha256=FJgZP_cpJkgAHA3j3ai_j3Wt7aFSEjIMVmt9NpvVXzg,1757
+ dataeval/utils/data/datasets/_mnist.py,sha256=PmEEyhJ50Wo4utCYYDJUSgY69FCvXgAhndSCTjfrWN0,7223
+ dataeval/utils/data/datasets/_ships.py,sha256=b521rr8OYuBBTHerBtIdYRTGnEs-E8AOZ2hsa2BvkN4,4361
+ dataeval/utils/data/datasets/_types.py,sha256=OOxgMjX3QnB-M2O_NqBOB9xsU26vmJO1CVZLksgiuSY,1203
+ dataeval/utils/data/datasets/_voc.py,sha256=_fqcwi7AQXmCkYomLe-4_u6wRAWCkBKYNd3HJGnINsY,13837
+ dataeval/utils/data/selections/__init__.py,sha256=RLjkIh2IAvPktLbUmyLv3p-rvDEaBAdWzjiNnnhVtn8,481
+ dataeval/utils/data/selections/_classfilter.py,sha256=jO_N7AmPMpkMW82Nrk6FU8hcOlxX-0vmmVeUZGU9Lzc,2295
+ dataeval/utils/data/selections/_indices.py,sha256=QdLgXN7GABCvGPYe28PV1RAc_RSP_nZOyCvEpKRBdWg,636
+ dataeval/utils/data/selections/_limit.py,sha256=ECvHRsp7OF4LZw2tE4sGqqJ085kjC-hd2c7QDMfvXr8,518
+ dataeval/utils/data/selections/_reverse.py,sha256=6SWpELC9Wgx-kPqzhDrPNn4NKU6FqDJveLrxV4D2Ypk,374
+ dataeval/utils/data/selections/_shuffle.py,sha256=U2dQPlX5JQhLjpqlk_uztks8G0H_GAl2DOl6ADNJaDY,581
+ dataeval/utils/metadata.py,sha256=X8Hu4LdCzAaE9uk1hI4BflmFve_VOQCqK9lXq0sk9ow,14196
+ dataeval/utils/torch/__init__.py,sha256=dn5mjCrFp0b1aL_UEURhONU0Ag0cmXoTOBSGagpkTiA,325
+ dataeval/utils/torch/_blocks.py,sha256=HVhBTMMD5NA4qheMUgyol1KWiKZDIuc8k5j4RcMKmhk,1466
+ dataeval/utils/torch/_gmm.py,sha256=XBHNLPTtLGRrzq0B4GI48Sha7YHL-0PpXil3s3exLGE,3714
+ dataeval/utils/torch/_internal.py,sha256=5BYibQvvXS-trsHi2x7gjxuaknLwSyj6yWXbOFEdx-M,5790
+ dataeval/utils/torch/models.py,sha256=hmroEs6C6jQ5tAoZa71RFeIvXLxfXrTJSFH_jG2LGQU,9749
+ dataeval/utils/torch/trainer.py,sha256=Qay0LK63RuyoGYiJ5zI2C5BVym309ORvp6shhpcrIU4,5589
+ dataeval/workflows/__init__.py,sha256=ou8y0KO-d6W5lgmcyLjKlf-J_ckP3vilW7wHkgiDlZ4,255
+ dataeval/workflows/sufficiency.py,sha256=mjKmfRrAjShLUFIARv5o8yT5fnFvDsS5Qu6ujIPUgQg,8497
+ dataeval-0.82.0.dist-info/LICENSE.txt,sha256=uAooygKWvX6NbU9Ran9oG2msttoG8aeTeHSTe5JeCnY,1061
+ dataeval-0.82.0.dist-info/METADATA,sha256=pBBKLbnfGy4_THG8YbCZHH4RY2HPbLrDAEWhuL2mCtQ,5304
+ dataeval-0.82.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+ dataeval-0.82.0.dist-info/RECORD,,
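Each entry in the new RECORD file above follows the standard wheel format `path,algorithm=urlsafe-b64-digest,size`, with the RECORD file listing itself without a hash or size. Below is a sketch of verifying those hashes against the wheel's contents; `verify_record` is a hypothetical helper name, built on stdlib calls only.

```python
import base64
import csv
import hashlib
from zipfile import ZipFile


def verify_record(wheel_path: str) -> list[str]:
    """Return the paths whose contents do not match their RECORD hash."""
    mismatches = []
    with ZipFile(wheel_path) as whl:
        record = next(n for n in whl.namelist() if n.endswith(".dist-info/RECORD"))
        for path, hash_spec, _size in csv.reader(whl.read(record).decode().splitlines()):
            if not hash_spec:  # RECORD lists itself without a hash
                continue
            algorithm, _, expected = hash_spec.partition("=")
            digest = hashlib.new(algorithm, whl.read(path)).digest()
            # RECORD encodes digests as urlsafe base64 with '=' padding stripped
            if base64.urlsafe_b64encode(digest).rstrip(b"=").decode() != expected:
                mismatches.append(path)
    return mismatches
```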
dataeval/detectors/linters/clusterer.py
@@ -1,512 +0,0 @@
- from __future__ import annotations
-
- __all__ = []
-
- from dataclasses import dataclass
- from typing import Any, Iterable, NamedTuple, cast
-
- import numpy as np
- from numpy.typing import ArrayLike, NDArray
- from scipy.cluster.hierarchy import linkage
- from scipy.spatial.distance import pdist, squareform
-
- from dataeval.interop import to_numpy
- from dataeval.output import Output, set_metadata
- from dataeval.utils.shared import flatten
-
-
- @dataclass(frozen=True)
- class ClustererOutput(Output):
-     """
-     Output class for :class:`Clusterer` lint detector.
-
-     Attributes
-     ----------
-     outliers : List[int]
-         Indices that do not fall within a cluster
-     potential_outliers : List[int]
-         Indices which are near the border between belonging in the cluster and being an outlier
-     duplicates : List[List[int]]
-         Groups of indices that are exact :term:`duplicates<Duplicates>`
-     potential_duplicates : List[List[int]]
-         Groups of indices which are not exact but closely related data points
-     """
-
-     outliers: list[int]
-     potential_outliers: list[int]
-     duplicates: list[list[int]]
-     potential_duplicates: list[list[int]]
-
-
- def _extend_linkage(link_arr: NDArray) -> NDArray:
-     """
-     Adds a column to the linkage matrix link_arr that tracks the new id assigned
-     to each row
-
-     Parameters
-     ----------
-     link_arr : NDArray
-         linkage matrix
-
-     Returns
-     -------
-     NDArray
-         linkage matrix with adjusted shape, new shape (link_arr.shape[0], link_arr.shape[1]+1)
-     """
-     # Adjusting linkage matrix to accommodate renumbering
-     rows, cols = link_arr.shape
-     arr = np.zeros((rows, cols + 1))
-     arr[:, :-1] = link_arr
-     arr[:, -1] = np.arange(rows + 1, 2 * rows + 1)
-
-     return arr
-
-
- class _Cluster:
-     __slots__ = "merged", "samples", "sample_dist", "is_copy", "count", "dist_avg", "dist_std", "out1", "out2"
-
-     def __init__(self, merged: int, samples: NDArray, sample_dist: float | NDArray, is_copy: bool = False) -> None:
-         self.merged = merged
-         self.samples = np.array(samples, dtype=np.int32)
-         self.sample_dist = np.array([sample_dist] if np.isscalar(sample_dist) else sample_dist)
-         self.is_copy = is_copy
-
-         dist = float(self.sample_dist[-1])
-
-         self.count = len(self.samples)
-         if is_copy:
-             self.dist_avg = 0.0
-             self.dist_std = 0.0
-             self.out1 = False
-             self.out2 = False
-         else:
-             self.dist_avg = float(np.mean(self.sample_dist))
-             self.dist_std = float(np.std(self.sample_dist)) if len(self.sample_dist) > 1 else 1e-5
-             out1 = self.dist_avg + self.dist_std
-             out2 = out1 + self.dist_std
-             self.out1 = dist > out1
-             self.out2 = dist > out2
-
-     def copy(self) -> _Cluster:
-         return _Cluster(False, self.samples, self.sample_dist, True)
-
-     def __repr__(self) -> str:
-         _params = {
-             "merged": self.merged,
-             "samples": self.samples,
-             "sample_dist": self.sample_dist,
-             "is_copy": self.is_copy,
-         }
-         return f"{self.__class__.__name__}(**{repr(_params)})"
-
-
- class _Clusters(dict[int, dict[int, _Cluster]]):
-     def __init__(self, *args: dict[int, dict[int, _Cluster]]) -> None:
-         super().__init__(*args)
-         self.max_level: int = 1
-
-
- class _ClusterPosition(NamedTuple):
-     """Keeps track of a cluster's level and ID"""
-
-     level: int
-     cid: int
-
-
- class _ClusterMergeEntry:
-     __slots__ = "level", "outer_cluster", "inner_cluster", "status"
-
-     def __init__(self, level: int, outer_cluster: int, inner_cluster: int, status: int) -> None:
-         self.level = level
-         self.outer_cluster = outer_cluster
-         self.inner_cluster = inner_cluster
-         self.status = status
-
-     def __lt__(self, value: _ClusterMergeEntry) -> bool:
-         return self.level.__lt__(value.level)
-
-     def __gt__(self, value: _ClusterMergeEntry) -> bool:
-         return self.level.__gt__(value.level)
-
-
- class Clusterer:
-     """
-     Uses hierarchical clustering to flag dataset properties of interest like outliers \
-     and :term:`duplicates<Duplicates>`.
-
-     Parameters
-     ----------
-     dataset : ArrayLike, shape - (N, P)
-         A dataset in an ArrayLike format.
-         Function expects the data to have 2 dimensions, N number of observations in a P-dimensional space.
-
-     Warning
-     -------
-     The Clusterer class is heavily dependent on computational resources, and may fail due to insufficient memory.
-
-     Note
-     ----
-     The Clusterer works best when the length of the feature dimension, P, is less than 500.
-     If flattening a CxHxW image results in a dimension larger than 500, then it is recommended to reduce the dimensions.
-     """
-
-     def __init__(self, dataset: ArrayLike) -> None:
-         # Allows an update to dataset to reset the state rather than instantiate a new class
-         self._on_init(dataset)
-
-     def _on_init(self, dataset: ArrayLike):
-         self._data: NDArray[Any] = flatten(to_numpy(dataset))
-         self._validate_data(self._data)
-         self._num_samples = len(self._data)
-
-         self._darr: NDArray[np.floating[Any]] = pdist(self._data, metric="euclidean")
-         self._sqdmat: NDArray[np.floating[Any]] = squareform(self._darr)
-         self._larr: NDArray[np.floating[Any]] = _extend_linkage(linkage(self._darr))
-         self._max_clusters: int = np.count_nonzero(self._larr[:, 3] == 2)
-
-         min_num = int(self._num_samples * 0.05)
-         self._min_num_samples_per_cluster: int = min(max(2, min_num), 100)
-
-         self._clusters: _Clusters | None = None
-         self._last_good_merge_levels: dict[int, int] | None = None
-
-     @property
-     def data(self) -> NDArray[Any]:
-         return self._data
-
-     @data.setter
-     def data(self, x: ArrayLike) -> None:
-         self._on_init(x)
-
-     @property
-     def clusters(self) -> _Clusters:
-         if self._clusters is None:
-             self._clusters = self._create_clusters()
-         return self._clusters
-
-     @property
-     def last_good_merge_levels(self) -> dict[int, int]:
-         if self._last_good_merge_levels is None:
-             self._last_good_merge_levels = self._get_last_merge_levels()
-         return self._last_good_merge_levels
-
-     @classmethod
-     def _validate_data(cls, x: NDArray):
-         """Checks that the data has the correct size, shape, and format"""
-         if not isinstance(x, np.ndarray):
-             raise TypeError(f"Data should be of type NDArray; got {type(x)}")
-
-         if x.ndim != 2:
-             raise ValueError(
-                 f"Data should only have 2 dimensions; got {x.ndim}. Data should be flattened before being input"
-             )
-         samples, features = x.shape  # Due to above check, we know shape has a length of 2
-         if samples < 2:
-             raise ValueError(f"Data should have at least 2 samples; got {samples}")
-         if features < 1:
-             raise ValueError(f"Samples should have at least 1 feature; got {features}")
-
-     def _create_clusters(self) -> _Clusters:
-         """Generates clusters based on linkage matrix"""
-         next_cluster_id = 0
-         cluster_map: dict[int, _ClusterPosition] = {}  # Dictionary to associate new cluster ids with actual clusters
-         clusters: _Clusters = _Clusters()
-
-         # Walking through the linkage array to generate clusters
-         for arr_i in self._larr:
-             left_id = int(arr_i[0])
-             right_id = int(arr_i[1])
-             sample_dist = np.array([arr_i[2]], dtype=np.float32)
-             merged = False
-
-             # Determine if the id is already associated with a cluster
-             left = cluster_map.get(left_id)
-             right = cluster_map.get(right_id)
-
-             if left and right:
-                 merged = max([left.cid, right.cid])
-                 lc = clusters[left.level][left.cid]
-                 rc = clusters[right.level][right.cid]
-                 left_first = len(lc.samples) >= len(rc.samples)
-                 samples = np.concatenate([lc.samples, rc.samples] if left_first else [rc.samples, lc.samples])
-                 sample_dist = np.concatenate([rc.sample_dist, lc.sample_dist, sample_dist])
-                 level, cid = max(left.level, right.level) + 1, min(left.cid, right.cid)
-
-                 # Only tracking the levels in which clusters merge for the cluster distance matrix
-                 clusters.max_level = max(clusters.max_level, left.level, right.level)
-                 # Update clusters to include previously skipped levels
-                 clusters = self._fill_levels(clusters, left, right)
-             elif left or right:
-                 child, other_id = cast(tuple[_ClusterPosition, int], (left, right_id) if left else (right, left_id))
-                 cc = clusters[child.level][child.cid]
-                 samples = np.concatenate([cc.samples, [other_id]])
-                 sample_dist = np.concatenate([cc.sample_dist, sample_dist])
-                 level, cid = child.level + 1, child.cid
-             else:
-                 samples = np.array([left_id, right_id], dtype=np.int32)
-                 level, cid = 0, next_cluster_id
-                 next_cluster_id += 1
-
-             # Set the cluster and associate the linkage id with the cluster
-             if level not in clusters:
-                 clusters[level] = {}
-
-             clusters[level][cid] = _Cluster(merged, samples, sample_dist)
-             cluster_map[int(arr_i[-1])] = _ClusterPosition(level, cid)
-
-         return clusters
-
-     def _fill_levels(self, clusters: _Clusters, left: _ClusterPosition, right: _ClusterPosition) -> _Clusters:
-         # Sets each level's cluster info if it does not exist
-         if left.level != right.level:
-             (level, cid), max_level = (left, right[0]) if left[0] < right[0] else (right, left[0])
-             cluster = clusters[level][cid].copy()
-             for level_id in range(max_level, level, -1):
-                 clusters[level_id].setdefault(cid, cluster)
-         return clusters
-
-     def _get_cluster_distances(self) -> NDArray:
-         """Calculates the minimum distances between clusters at each level"""
-         # Cluster distance matrix
-         max_level = self.clusters.max_level
-         cluster_matrix = np.full((max_level, self._max_clusters, self._max_clusters), -1.0, dtype=np.float32)
-
-         for level, cluster_set in self.clusters.items():
-             if level < max_level:
-                 cluster_ids = sorted(cluster_set.keys())
-                 for i, cluster_id in enumerate(cluster_ids):
-                     cluster_matrix[level, cluster_id, cluster_id] = self.clusters[level][cluster_id].dist_avg
-                     for int_id in range(i + 1, len(cluster_ids)):
-                         compare_id = cluster_ids[int_id]
-                         sample_a = self.clusters[level][cluster_id].samples
-                         sample_b = self.clusters[level][compare_id].samples
-                         min_mat = self._sqdmat[np.ix_(sample_a, sample_b)].min()
-                         cluster_matrix[level, cluster_id, compare_id] = min_mat
-                         cluster_matrix[level, compare_id, cluster_id] = min_mat
-
-         return cluster_matrix
-
-     def _calc_merge_indices(self, merge_mean: list[NDArray], intra_max: list[float]) -> NDArray:
-         """
-         Determine what clusters should be merged and return their indices
-         """
-         intra_max_uniques = np.unique(intra_max)
-         intra_log_values = np.log(intra_max_uniques)
-         two_std_all = intra_log_values.mean() + 2 * intra_log_values.std()
-         merge_value = np.log(merge_mean)
-         # Mask of indices we know we want to merge
-         desired_merge = merge_value < two_std_all
-
-         # List[Values] for indices we might want to merge
-         check = merge_value[~desired_merge]
-         # Check distance from value to 2 stds of all values
-         check = np.abs((check - two_std_all) / two_std_all)
-         # Mask List[Values < 1]
-         mask = check < 1
-         one_std_check = check[mask].mean() + check[mask].std()
-         # Mask of indices that should also be merged
-         mask2_vals = np.abs((merge_value - two_std_all) / two_std_all)
-         mask2 = mask2_vals < one_std_check
-         return np.logical_or(desired_merge, mask2)
-
-     def _generate_merge_list(self, cluster_matrix: NDArray) -> list[_ClusterMergeEntry]:
-         """
-         Runs through the clusters dictionary determining when clusters merge,
-         and how close those clusters are when they merge.
-
-         Parameters
-         ----------
-         cluster_matrix:
-             The distance matrix for all clusters to all others
-
-         Returns
-         -------
-         List[ClusterMergeEntry]:
-             A list with each cluster's merge history
-         """
-         intra_max = []
-         merge_mean = []
-         merge_list: list[_ClusterMergeEntry] = []
-
-         for level, cluster_set in self.clusters.items():
-             for outer_cluster, cluster in cluster_set.items():
-                 inner_cluster = cluster.merged
-                 if not inner_cluster:
-                     continue
-                 # Extract necessary information
-                 num_samples = len(cluster.samples)
-                 out1 = cluster.out1
-                 out2 = cluster.out2
-
-                 # If outside 2-std or 1-std and larger than a minimum sized cluster, take the mean distance, else max
-                 aggregate_func = (
-                     np.mean if out2 or (out1 and num_samples >= self._min_num_samples_per_cluster) else np.max
-                 )
-
-                 distances = cluster_matrix[:level, outer_cluster, inner_cluster]
-                 intra_distance = cluster_matrix[:, outer_cluster, outer_cluster]
-                 positive_mask = intra_distance >= 0
-                 intra_filtered = intra_distance[positive_mask]
-
-                 # TODO: Append now, take max over axis later?
-                 intra_max.append(np.max(intra_filtered))
-                 # Calculate the corresponding distance stats
-                 distance_stats_arr = aggregate_func(distances)
-                 merge_mean.append(distance_stats_arr)
-                 merge_list.append(_ClusterMergeEntry(level, outer_cluster, inner_cluster, 0))
-
-         all_merge_indices = self._calc_merge_indices(merge_mean=merge_mean, intra_max=intra_max)
-
-         for i, is_mergeable in enumerate(all_merge_indices):
-             merge_list[i].status = is_mergeable
-
-         merge_list = sorted(merge_list, reverse=True)
-
-         return merge_list
-
-     def _get_last_merge_levels(self) -> dict[int, int]:
-         """
-         Creates a dictionary for important cluster ids mapped to their last good merge level
-
-         Returns
-         -------
-         Dict[int, int]
-             A mapping of a cluster id to its last good merge level
-         """
-         last_merge_levels: dict[int, int] = {}
-
-         if self._max_clusters <= 1:
-             last_merge_levels = {0: int(self._num_samples * 0.1)}
-         else:
-             cluster_matrix = self._get_cluster_distances()
-             merge_list = self._generate_merge_list(cluster_matrix)
-             for entry in merge_list:
-                 if not entry.status:
-                     if entry.outer_cluster not in last_merge_levels:
-                         last_merge_levels[entry.outer_cluster] = 0
-                     if entry.inner_cluster not in last_merge_levels:
-                         last_merge_levels[entry.inner_cluster] = 0
-                     if last_merge_levels[entry.outer_cluster] > entry.level:
-                         last_merge_levels[entry.outer_cluster] = entry.level - 1
-                 else:
-                     if entry.outer_cluster in last_merge_levels:
-                         last_merge_levels[entry.outer_cluster] = max(
-                             last_merge_levels[entry.outer_cluster], entry.level
-                         )
-
-         return last_merge_levels
-
-     def find_outliers(self, last_merge_levels: dict[int, int]) -> tuple[list[int], list[int]]:
-         """
-         Retrieves Outliers based on when the sample was added to the cluster
-         and how far it was from the cluster when it was added
-
-         Parameters
-         ----------
-         last_merge_levels : Dict[int, int]
-             A mapping of a cluster id to its last good merge level
-
-         Returns
-         -------
-         Tuple[List[int], List[int]]
-             The outliers and possible outliers as sorted lists of indices
-         """
-         outliers = set()
-         possible_outliers = set()
-         already_seen = set()
-         last_level = {}
-
-         for level, cluster_set in self.clusters.items():
-             for cluster_id, cluster in cluster_set.items():
-                 if cluster_id in last_merge_levels:
-                     last_level[cluster_id] = level
-
-         for level, cluster_set in self.clusters.items():
-             for cluster_id, cluster in cluster_set.items():
-                 if not cluster.merged and cluster_id in last_merge_levels and level > last_merge_levels[cluster_id]:
-                     if cluster_id in already_seen and cluster.samples[-1] not in outliers:
-                         outliers.add(cluster.samples[-1])
-                     elif cluster.out2:
-                         if len(cluster.samples) < self._min_num_samples_per_cluster:
-                             outliers.update(cluster.samples.tolist())
-                         elif cluster.samples[-1] not in outliers:
-                             outliers.add(cluster.samples[-1])
-                         if cluster_id not in already_seen:
-                             already_seen.add(cluster_id)
-                     elif cluster.out1 and len(cluster.samples) >= self._min_num_samples_per_cluster:
-                         possible_outliers.add(cluster.samples[-1])
-                     elif level == last_level[cluster_id] and len(cluster.samples) < self._min_num_samples_per_cluster:
-                         outliers.update(cluster.samples.tolist())
-
-         return sorted(outliers), sorted(possible_outliers)
-
-     def _sorted_union_find(self, index_groups: Iterable[Iterable[int]]) -> list[list[int]]:
-         """Merges and sorts groups of indices that share any common index"""
-         groups: list[list[int]] = []
-         for indices in zip(*index_groups):
-             indices = set(indices)
-             temp = []
-             for group in groups:
-                 if not set(group).isdisjoint(indices):
-                     indices.update(group)
-                 else:
-                     temp.append(group)
-             temp.append(sorted(indices))
-             groups = temp
-         return sorted(groups)
-
-     def find_duplicates(self, last_merge_levels: dict[int, int]) -> tuple[list[list[int]], list[list[int]]]:
-         """
-         Finds duplicate and near duplicate data based on the last good merge levels when building the cluster
-
-         Parameters
-         ----------
-         last_merge_levels : Dict[int, int]
-             A mapping of a cluster id to its last good merge level
-
-         Returns
-         -------
-         Tuple[List[List[int]], List[List[int]]]
-             The exact :term:`duplicates<Duplicates>` and near duplicates as lists of related indices
-         """
-
-         duplicates_std = []
-         for cluster_id, level in last_merge_levels.items():
-             samples = self.clusters[level][cluster_id].samples
-             if len(samples) >= self._min_num_samples_per_cluster:
-                 duplicates_std.append(self.clusters[level][cluster_id].dist_std)
-         diag_mask = np.ones_like(self._sqdmat, dtype=np.bool_)
-         np.fill_diagonal(diag_mask, 0)
-         diag_mask = np.triu(diag_mask)
-
-         exact_mask = self._sqdmat <= (np.mean(duplicates_std) / 100)
-         exact_indices = np.nonzero(exact_mask & diag_mask)
-         exact_dupes = self._sorted_union_find(exact_indices)
-
-         near_mask = self._sqdmat <= np.mean(duplicates_std)
-         near_indices = np.nonzero(near_mask & diag_mask & ~exact_mask)
-         near_dupes = self._sorted_union_find(near_indices)
-
-         return exact_dupes, near_dupes
-
-     # TODO: Move data input to evaluate from class
-     @set_metadata(state=["data"])
-     def evaluate(self) -> ClustererOutput:
-         """Finds and flags indices of the data for Outliers and :term:`duplicates<Duplicates>`
-
-         Returns
-         -------
-         ClustererOutput
-             The Outliers and duplicate indices found in the data
-
-         Example
-         -------
-         >>> cluster = Clusterer(clusterer_images)
-         >>> cluster.evaluate()
-         ClustererOutput(outliers=[18, 21, 34, 35, 45], potential_outliers=[13, 15, 42], duplicates=[[9, 24], [23, 48]], potential_duplicates=[[1, 11]])
-         """  # noqa: E501
-
-         outliers, potential_outliers = self.find_outliers(self.last_good_merge_levels)
-         duplicates, potential_duplicates = self.find_duplicates(self.last_good_merge_levels)
-
-         return ClustererOutput(outliers, potential_outliers, duplicates, potential_duplicates)
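The removed `Clusterer` walked scipy's linkage matrix, and its `_extend_linkage` helper (above) simply made scipy's implicit cluster numbering explicit as an extra column. A standalone sketch of that convention, with illustrative data:

```python
import numpy as np
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import pdist

rng = np.random.default_rng(0)
data = rng.normal(size=(10, 3))  # 10 samples in a 3-dimensional space

# Row i of scipy's linkage matrix merges the two clusters in columns 0-1 at
# the distance in column 2; the merged cluster implicitly receives id n + i.
larr = linkage(pdist(data))
n = data.shape[0]
new_ids = np.arange(n, n + larr.shape[0])  # ids n .. 2n-2, one per merge row

# _extend_linkage appended these ids as a fifth column so later passes could
# look up which cluster each linkage row produced.
extended = np.column_stack([larr, new_ids])
assert extended.shape == (n - 1, 5)
```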
dataeval/detectors/linters/merged_stats.py
@@ -1,49 +0,0 @@
- from __future__ import annotations
-
- __all__ = []
-
- from copy import deepcopy
- from typing import Sequence, TypeVar
-
- import numpy as np
-
- from dataeval.metrics.stats.base import BaseStatsOutput
-
- TStatsOutput = TypeVar("TStatsOutput", bound=BaseStatsOutput)
-
-
- def add_stats(a: TStatsOutput, b: TStatsOutput) -> TStatsOutput:
-     if type(a) is not type(b):
-         raise TypeError(f"Types {type(a)} and {type(b)} cannot be added.")
-
-     sum_dict = deepcopy(a.dict())
-
-     for k in sum_dict:
-         if isinstance(sum_dict[k], list):
-             sum_dict[k].extend(b.dict()[k])
-         else:
-             sum_dict[k] = np.concatenate((sum_dict[k], b.dict()[k]))
-
-     return type(a)(**sum_dict)
-
-
- def combine_stats(stats: Sequence[TStatsOutput]) -> tuple[TStatsOutput, list[int]]:
-     output = None
-     dataset_steps = []
-     cur_len = 0
-     for s in stats:
-         output = s if output is None else add_stats(output, s)
-         cur_len += len(s)
-         dataset_steps.append(cur_len)
-     if output is None:
-         raise TypeError("Cannot combine empty sequence of stats.")
-     return output, dataset_steps
-
-
- def get_dataset_step_from_idx(idx: int, dataset_steps: list[int]) -> tuple[int, int]:
-     last_step = 0
-     for i, step in enumerate(dataset_steps):
-         if idx < step:
-             return i, idx - last_step
-         last_step = step
-     return -1, idx
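For context, the removed `combine_stats` returns cumulative lengths in `dataset_steps`, and `get_dataset_step_from_idx` maps a flattened index back to a `(dataset, local index)` pair. A worked sketch of that bookkeeping follows; `locate` is a hypothetical equivalent built on `bisect`, not part of the package.

```python
from bisect import bisect_right


def locate(idx: int, dataset_steps: list[int]) -> tuple[int, int]:
    """Map a flattened index to (dataset index, index within that dataset)."""
    i = bisect_right(dataset_steps, idx)
    if i == len(dataset_steps):
        return -1, idx  # past the last step: same sentinel as the removed helper
    return i, idx - (dataset_steps[i - 1] if i else 0)


# Three stats outputs of lengths 4, 2, and 3 yield dataset_steps == [4, 6, 9].
steps = [4, 6, 9]
assert locate(0, steps) == (0, 0)   # first item of the first dataset
assert locate(5, steps) == (1, 1)   # second item of the second dataset
assert locate(9, steps) == (-1, 9)  # out-of-range sentinel
```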