dataeval 0.76.1__py3-none-any.whl → 0.82.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (113)
  1. dataeval/__init__.py +3 -3
  2. dataeval/config.py +77 -0
  3. dataeval/detectors/__init__.py +1 -1
  4. dataeval/detectors/drift/__init__.py +6 -6
  5. dataeval/detectors/drift/{base.py → _base.py} +40 -85
  6. dataeval/detectors/drift/{cvm.py → _cvm.py} +21 -28
  7. dataeval/detectors/drift/{ks.py → _ks.py} +20 -26
  8. dataeval/detectors/drift/{mmd.py → _mmd.py} +31 -43
  9. dataeval/detectors/drift/{torch.py → _torch.py} +2 -1
  10. dataeval/detectors/drift/{uncertainty.py → _uncertainty.py} +24 -7
  11. dataeval/detectors/drift/updates.py +20 -3
  12. dataeval/detectors/linters/__init__.py +3 -5
  13. dataeval/detectors/linters/duplicates.py +13 -36
  14. dataeval/detectors/linters/outliers.py +23 -148
  15. dataeval/detectors/ood/__init__.py +1 -1
  16. dataeval/detectors/ood/ae.py +30 -9
  17. dataeval/detectors/ood/base.py +5 -4
  18. dataeval/detectors/ood/mixin.py +21 -7
  19. dataeval/detectors/ood/vae.py +73 -0
  20. dataeval/metadata/__init__.py +6 -0
  21. dataeval/metadata/_distance.py +167 -0
  22. dataeval/metadata/_ood.py +217 -0
  23. dataeval/metadata/_utils.py +44 -0
  24. dataeval/metrics/__init__.py +1 -1
  25. dataeval/metrics/bias/__init__.py +6 -4
  26. dataeval/metrics/bias/{balance.py → _balance.py} +15 -101
  27. dataeval/metrics/bias/_coverage.py +98 -0
  28. dataeval/metrics/bias/{diversity.py → _diversity.py} +18 -111
  29. dataeval/metrics/bias/{parity.py → _parity.py} +39 -77
  30. dataeval/metrics/estimators/__init__.py +15 -4
  31. dataeval/metrics/estimators/{ber.py → _ber.py} +42 -29
  32. dataeval/metrics/estimators/_clusterer.py +44 -0
  33. dataeval/metrics/estimators/{divergence.py → _divergence.py} +18 -30
  34. dataeval/metrics/estimators/{uap.py → _uap.py} +4 -18
  35. dataeval/metrics/stats/__init__.py +16 -13
  36. dataeval/metrics/stats/{base.py → _base.py} +82 -133
  37. dataeval/metrics/stats/{boxratiostats.py → _boxratiostats.py} +15 -18
  38. dataeval/metrics/stats/_dimensionstats.py +75 -0
  39. dataeval/metrics/stats/{hashstats.py → _hashstats.py} +21 -37
  40. dataeval/metrics/stats/_imagestats.py +94 -0
  41. dataeval/metrics/stats/_labelstats.py +131 -0
  42. dataeval/metrics/stats/{pixelstats.py → _pixelstats.py} +19 -50
  43. dataeval/metrics/stats/{visualstats.py → _visualstats.py} +23 -54
  44. dataeval/outputs/__init__.py +53 -0
  45. dataeval/{output.py → outputs/_base.py} +55 -25
  46. dataeval/outputs/_bias.py +381 -0
  47. dataeval/outputs/_drift.py +83 -0
  48. dataeval/outputs/_estimators.py +114 -0
  49. dataeval/outputs/_linters.py +184 -0
  50. dataeval/{detectors/ood/output.py → outputs/_ood.py} +22 -22
  51. dataeval/outputs/_stats.py +387 -0
  52. dataeval/outputs/_utils.py +44 -0
  53. dataeval/outputs/_workflows.py +364 -0
  54. dataeval/typing.py +234 -0
  55. dataeval/utils/__init__.py +2 -2
  56. dataeval/utils/_array.py +169 -0
  57. dataeval/utils/_bin.py +199 -0
  58. dataeval/utils/_clusterer.py +144 -0
  59. dataeval/utils/_fast_mst.py +189 -0
  60. dataeval/utils/{image.py → _image.py} +6 -4
  61. dataeval/utils/_method.py +14 -0
  62. dataeval/utils/{shared.py → _mst.py} +3 -65
  63. dataeval/utils/{plot.py → _plot.py} +6 -6
  64. dataeval/utils/data/__init__.py +26 -0
  65. dataeval/utils/data/_dataset.py +217 -0
  66. dataeval/utils/data/_embeddings.py +104 -0
  67. dataeval/utils/data/_images.py +68 -0
  68. dataeval/utils/data/_metadata.py +360 -0
  69. dataeval/utils/data/_selection.py +126 -0
  70. dataeval/utils/{dataset/split.py → data/_split.py} +12 -38
  71. dataeval/utils/data/_targets.py +85 -0
  72. dataeval/utils/data/collate.py +103 -0
  73. dataeval/utils/data/datasets/__init__.py +17 -0
  74. dataeval/utils/data/datasets/_base.py +254 -0
  75. dataeval/utils/data/datasets/_cifar10.py +134 -0
  76. dataeval/utils/data/datasets/_fileio.py +168 -0
  77. dataeval/utils/data/datasets/_milco.py +153 -0
  78. dataeval/utils/data/datasets/_mixin.py +56 -0
  79. dataeval/utils/data/datasets/_mnist.py +183 -0
  80. dataeval/utils/data/datasets/_ships.py +123 -0
  81. dataeval/utils/data/datasets/_types.py +52 -0
  82. dataeval/utils/data/datasets/_voc.py +352 -0
  83. dataeval/utils/data/selections/__init__.py +15 -0
  84. dataeval/utils/data/selections/_classfilter.py +57 -0
  85. dataeval/utils/data/selections/_indices.py +26 -0
  86. dataeval/utils/data/selections/_limit.py +26 -0
  87. dataeval/utils/data/selections/_reverse.py +18 -0
  88. dataeval/utils/data/selections/_shuffle.py +29 -0
  89. dataeval/utils/metadata.py +51 -376
  90. dataeval/utils/torch/{gmm.py → _gmm.py} +4 -2
  91. dataeval/utils/torch/{internal.py → _internal.py} +21 -51
  92. dataeval/utils/torch/models.py +43 -2
  93. dataeval/workflows/__init__.py +2 -1
  94. dataeval/workflows/sufficiency.py +11 -346
  95. {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/METADATA +5 -2
  96. dataeval-0.82.0.dist-info/RECORD +104 -0
  97. dataeval/detectors/linters/clusterer.py +0 -512
  98. dataeval/detectors/linters/merged_stats.py +0 -49
  99. dataeval/detectors/ood/metadata_ks_compare.py +0 -129
  100. dataeval/detectors/ood/metadata_least_likely.py +0 -119
  101. dataeval/interop.py +0 -69
  102. dataeval/metrics/bias/coverage.py +0 -194
  103. dataeval/metrics/stats/datasetstats.py +0 -202
  104. dataeval/metrics/stats/dimensionstats.py +0 -115
  105. dataeval/metrics/stats/labelstats.py +0 -210
  106. dataeval/utils/dataset/__init__.py +0 -7
  107. dataeval/utils/dataset/datasets.py +0 -412
  108. dataeval/utils/dataset/read.py +0 -63
  109. dataeval-0.76.1.dist-info/RECORD +0 -67
  110. /dataeval/{log.py → _log.py} +0 -0
  111. /dataeval/utils/torch/{blocks.py → _blocks.py} +0 -0
  112. {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/LICENSE.txt +0 -0
  113. {dataeval-0.76.1.dist-info → dataeval-0.82.0.dist-info}/WHEEL +0 -0
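For readers who want to reproduce a file-level summary like the one above, here is a minimal sketch using only the Python standard library. The function name `wheel_file_changes` is hypothetical; it classifies paths as added, removed, or changed by comparing CRCs, and does not attempt the rename detection shown in the list above.

```python
from zipfile import ZipFile


def wheel_file_changes(old_wheel: str, new_wheel: str) -> dict[str, str]:
    """Classify each path across two wheels as added, removed, or changed."""
    with ZipFile(old_wheel) as old, ZipFile(new_wheel) as new:
        # A wheel is a zip archive; each member carries a CRC of its contents.
        old_crcs = {i.filename: i.CRC for i in old.infolist()}
        new_crcs = {i.filename: i.CRC for i in new.infolist()}
    changes = {}
    for path in sorted(old_crcs.keys() | new_crcs.keys()):
        if path not in old_crcs:
            changes[path] = "added"
        elif path not in new_crcs:
            changes[path] = "removed"
        elif old_crcs[path] != new_crcs[path]:
            changes[path] = "changed"
    return changes
```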
dataeval-0.82.0.dist-info/RECORD
@@ -0,0 +1,104 @@
+ dataeval/__init__.py,sha256=aaXb18noAWzNZsE9bIlMnglK-d9BMJCm1wYsQyzX6sc,1510
+ dataeval/_log.py,sha256=Mn5bRWO0cgtAYd5VGYSFiPgu57ta3zoktrtHAZ1m3dU,357
+ dataeval/config.py,sha256=x55jqLFrlHvOcNqPXudVnF24yc3OAaEAu-q9NJZSIq4,2225
+ dataeval/detectors/__init__.py,sha256=3Sg-XWlwr75zEEH3hZKA4nWMtGvaRlnfzTWvZG_Ak6U,189
+ dataeval/detectors/drift/__init__.py,sha256=6is_XBtG1d-vUbhHvqXGOdnAwxJ7NA5yRfURn7pCeIw,651
+ dataeval/detectors/drift/_base.py,sha256=mJdKvyROgWvz-p1VlAIJqUI6BAj9ss8riUvR5An5wIw,13459
+ dataeval/detectors/drift/_cvm.py,sha256=H2w-I0eMD7yP-CSmpdodeJ0-TYznJT7w_H7JuobESow,3859
+ dataeval/detectors/drift/_ks.py,sha256=-5k3RBPA3kadX7oD14Wc52rAqQf1udwFeW7Qf3Sv4Tw,4058
+ dataeval/detectors/drift/_mmd.py,sha256=_z1ateuWy8TMtP20oTIOSwBeqkXTmo3C2_Q5_7QKnBs,7258
+ dataeval/detectors/drift/_torch.py,sha256=BY-AEqjkzX8fJnLJSBosHnsRsUorL0de_ysJjkZyS0s,7687
+ dataeval/detectors/drift/_uncertainty.py,sha256=WJBlMAPBKD1qRCc0lxkKIqux4tPdOT4p-rwhD6Vuu2Q,5703
+ dataeval/detectors/drift/updates.py,sha256=Btu2iaZW7fbO59G1w5v3ykFot0YPzy2U6VjF0d440VE,2195
+ dataeval/detectors/linters/__init__.py,sha256=xn2zPwUcmsuf-Jd9uw6AVI11C9z1b1Y9fYtuFnXenZ0,404
+ dataeval/detectors/linters/duplicates.py,sha256=om2d_wR3vtzI6CG_Apu74T9FMXllss99H0ELz_JFADQ,4935
+ dataeval/detectors/linters/outliers.py,sha256=1eiVrM_A-glZWw2-ISy0JYkM_Ki9JIuRnTVa-eXwQi0,9042
+ dataeval/detectors/ood/__init__.py,sha256=juCYBDs7CQEAtMhnEpPqF6uTrOIH9kTBSuQ_GRw6a8o,283
+ dataeval/detectors/ood/ae.py,sha256=hB2iV8YhYceJUMPyop5048eB78hUrAkRCnyOyJY5-8o,2949
+ dataeval/detectors/ood/base.py,sha256=I2gW8cRWR-eBSI2zwESDrnYUEsMlhRsnWJWVyw4Jgkg,3047
+ dataeval/detectors/ood/metadata_ood_mi.py,sha256=7_Sdzf7-x1TlrIQvSyOIB98C8_UQhUwmwFQmZ9_q1Uc,4042
+ dataeval/detectors/ood/mixin.py,sha256=jc_mrtCRmeV51veiyD48sBxNc70-_MBT6ugNIB7D2W8,5431
+ dataeval/detectors/ood/vae.py,sha256=dEEf0TSnLl6xs80LEq0CEUlFvXIUtbOwoUNlnF8ig6g,2260
+ dataeval/metadata/__init__.py,sha256=B5Ix4T75UPEqY0rofaJlbRf8zCqx8yWLdE3Jo9cALHc,262
+ dataeval/metadata/_distance.py,sha256=zcuGFY4Zymp5U1S0OR9p1JT5zjqO0sAkmWfY5lxb9VY,4898
+ dataeval/metadata/_ood.py,sha256=k-7v8ZHdTC2TUCr07B1MtKIkGIHpPFJRKcH0Rey4pfY,8010
+ dataeval/metadata/_utils.py,sha256=r8qBJT83RblobD5W5zyTVi6vYi51Dwkqswizdbzss-M,1169
+ dataeval/metrics/__init__.py,sha256=8VC8q3HuJN3o_WN51Ae2_wXznl3RMXIvA5GYVcy7vr8,225
+ dataeval/metrics/bias/__init__.py,sha256=1yTLmgiu1kwT_7ZWcjOUbj8R0NJ0DjGoCuWdA0_T7kc,683
+ dataeval/metrics/bias/_balance.py,sha256=x0daiY0TaiuanwxIbOPm7_0ksepE25nGULLrOotWqMU,5927
+ dataeval/metrics/bias/_coverage.py,sha256=PeUoOiaghUEdn6Ov8z2-am7-fnBVIPcFbJK7Ty5JObA,3647
+ dataeval/metrics/bias/_diversity.py,sha256=JvLN6tGIcGpRfQt4INdEyPQwU8OOLSiosPnMnh6RDd0,5668
+ dataeval/metrics/bias/_parity.py,sha256=heQr_CUcdhHU9x7kT3FtF8w30IEKsok798dRW5jOUGA,11384
+ dataeval/metrics/estimators/__init__.py,sha256=Pnds8uIyAovt2fKqZjiHCIP_kVoBWlVllekYuK5UmmU,568
+ dataeval/metrics/estimators/_ber.py,sha256=TrZNO1frRldUDICLzaQGt9wuMiqmvsUFdkZ3cIVv9W4,5344
+ dataeval/metrics/estimators/_clusterer.py,sha256=1HrpihGTJ63IkNSOy4Ibw633Gllkm1RxKmoKT5MOgt0,1434
+ dataeval/metrics/estimators/_divergence.py,sha256=QDWl1lyAYoO9D3Ho7qOHSk6ud8Gi2MGuXEsYwO1HxvA,4043
+ dataeval/metrics/estimators/_uap.py,sha256=BULEBbJ9BQ1IcTeZf0x7iI60QHAWCccBOM97FIu9VXA,1928
+ dataeval/metrics/stats/__init__.py,sha256=6tA_9nbbM5ObJ6cds8Y1VBtTQiTOxrpGQSFLu_lWGGA,1098
+ dataeval/metrics/stats/_base.py,sha256=VlsoHatQBJ4XVNO8pMHUl-2NCC39fknWsQvs7dimMNA,10838
+ dataeval/metrics/stats/_boxratiostats.py,sha256=K0hkPuLYHHZJEJG8MOPEGcY7ASsRQLKpj7V7yy4-xAc,6341
+ dataeval/metrics/stats/_dimensionstats.py,sha256=73mFP-Myxne0peFliwvTntc0kk4cpq0krzMvSLDSIMM,2702
+ dataeval/metrics/stats/_hashstats.py,sha256=gp9X_pnTT3mPH9YNrWLdn2LQPK_epJ3dQRoyOCwmKlg,4758
+ dataeval/metrics/stats/_imagestats.py,sha256=Usxuc7_TJVNCm5SnwV6oYfoD333HQ6c4xjdth3N0b6Y,3000
+ dataeval/metrics/stats/_labelstats.py,sha256=PtGyqj4RHw0cyLAWAR9FzZGqgA81AtxLGHZiuMAL2h0,4100
+ dataeval/metrics/stats/_pixelstats.py,sha256=SfergRbjNJE4h0xqe-0c8RnKtZmEkZ9MwExdipLSGvg,3247
+ dataeval/metrics/stats/_visualstats.py,sha256=cq4AbF2B50Ihbzb86FphcnKQ1TSwNnP3PsnbpiPQZWw,3698
+ dataeval/outputs/__init__.py,sha256=sXWjCvB4-uFMUGFHGEhZxp7jid39xN5AOWbFMQ2SPeE,1419
+ dataeval/outputs/_base.py,sha256=PKu-jo6jxuuCHFFpY147wxSI7vp-6LlKVSTudYJ86GQ,4817
+ dataeval/outputs/_bias.py,sha256=aTsDeG48LAW_Z5kcKpJzcP4NpwQSNveZeG51sCHUcQo,12171
+ dataeval/outputs/_drift.py,sha256=gOiu2C-ERTWiRqlP0auMYxPBGdm9HecWPqWfg7I4tZg,2015
+ dataeval/outputs/_estimators.py,sha256=a2oAIxxEDZ9WLGfMWH8KD-BVUS_SnULRPR-iI9hFPoQ,3047
+ dataeval/outputs/_linters.py,sha256=kbzJne6ZFu795JVBK_p7jv_U72cqErhKXHsitzYBgPE,6396
+ dataeval/outputs/_ood.py,sha256=suLKVXULGtXH0rq9eXHI1d3d2jhGmItJtz4QiQd47A4,1718
+ dataeval/outputs/_stats.py,sha256=NrINChtF3D7Nq6VeXzhTk7ZszDWnluZ4HhLELXeK4xw,13010
+ dataeval/outputs/_utils.py,sha256=HHlGC7sk416m_3Bgn075Qdblz_aPup_UOafJpB0RuXY,893
+ dataeval/outputs/_workflows.py,sha256=MkRD6ubI4NCBXb9v3kjXy64cUGs3G-JKkBdOpRD9XVE,10750
+ dataeval/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ dataeval/typing.py,sha256=YNSCZ6V39JaPoXwOEexIOIejTjoCQTE9UA_DbHyW34o,5824
+ dataeval/utils/__init__.py,sha256=T8F8zJh4ZAeu0wDzfpld92I2zJg9mWBmkGCHrDPU7gk,264
+ dataeval/utils/_array.py,sha256=fc04sYShIdsRS4qtG1UCnlGGk-yVRxlOHTNAmW7NpDY,4990
+ dataeval/utils/_bin.py,sha256=nylthmsC3vzLHLhlUMACvZs--h7xvAh9Pt75InaQJW8,7322
+ dataeval/utils/_clusterer.py,sha256=fw5x-2QN0TIbiodDKHZxRgxKHINedpPcOklzce0Rbjg,5436
+ dataeval/utils/_fast_mst.py,sha256=4_7ykVihCL5jWtxcGnrecIsDQo65kUml9SZ1JxgBZYY,7172
+ dataeval/utils/_image.py,sha256=capzF_X5H0jy0PmTP3Hf52GFgLqrnfU6gS4tiwck9jo,1939
+ dataeval/utils/_method.py,sha256=9B9JQbgqWJBRhQJb7glajUtWaQzUTIUuvrZ9_bisxsM,394
+ dataeval/utils/_mst.py,sha256=gXjUUhz9G4wkcCUTqQ-61Ti9sZUFx08hEjlZXWiEmPc,2163
+ dataeval/utils/_plot.py,sha256=mTRQNbJsA42QMiOwZbJaH8sNYgP996QFDEGVVE9HSgY,7076
+ dataeval/utils/data/__init__.py,sha256=vldQ2ZXl8gnI3s4vAGqUUVi6dc_R58F3JMSpbCOyFRI,820
+ dataeval/utils/data/_dataset.py,sha256=tjZUJnxj9IY71GKqdKltrwufkn0EC0S3a6ylrW5Bc2s,7756
+ dataeval/utils/data/_embeddings.py,sha256=6yMzMT7tHRLaepuHVXom_ffvSwxatjppQZYJj1uKoe0,3565
+ dataeval/utils/data/_images.py,sha256=pv_vvpH8hWxPgLvjeVC2mZiyZivZFNLARNIOXam5ceY,1984
+ dataeval/utils/data/_metadata.py,sha256=Naxzf68V5_8oYYXRfO99-86LQQYwopr6Q_hfzCD5oZ4,13841
+ dataeval/utils/data/_selection.py,sha256=vpiYK6UqOXabqeXuhLnRuGJAOtw3ErqpxSc4bK7B7c0,4202
+ dataeval/utils/data/_split.py,sha256=ap_h52ncVev87VFbn2_WUHb0ZreUQQYokqz1SLF8-uI,18346
+ dataeval/utils/data/_targets.py,sha256=ws5d9wRiDkIuOV7GSAKNxzgSm6AWTgb0BFroQK5nAmM,3057
+ dataeval/utils/data/collate.py,sha256=Z5nmBnWV_IoJzMp_tj8RCKjMJA9sSCY_zZITqISGixc,3865
+ dataeval/utils/data/datasets/__init__.py,sha256=jBrswiERrvBx4pJQJZIq_B5UE-Wy8a2_SBfM2crG8R8,511
+ dataeval/utils/data/datasets/_base.py,sha256=lvC13xCy6DDlDBSu5PSdU73ySyeBde2Q91NxrMtos_s,8732
+ dataeval/utils/data/datasets/_cifar10.py,sha256=tDALqZKKcUbG6dHZm4MXQMhUCS1W_U-aC9aOu4fjxcM,5160
+ dataeval/utils/data/datasets/_fileio.py,sha256=SixIk5nIlIwJdX9zjNXS10vHA3hL8aaYbqHsDg1xSpY,6447
+ dataeval/utils/data/datasets/_milco.py,sha256=8FLMNn1qI92zGlsB3sAqDTATdK807wqQNCCiSL5lBOM,6068
+ dataeval/utils/data/datasets/_mixin.py,sha256=FJgZP_cpJkgAHA3j3ai_j3Wt7aFSEjIMVmt9NpvVXzg,1757
+ dataeval/utils/data/datasets/_mnist.py,sha256=PmEEyhJ50Wo4utCYYDJUSgY69FCvXgAhndSCTjfrWN0,7223
+ dataeval/utils/data/datasets/_ships.py,sha256=b521rr8OYuBBTHerBtIdYRTGnEs-E8AOZ2hsa2BvkN4,4361
+ dataeval/utils/data/datasets/_types.py,sha256=OOxgMjX3QnB-M2O_NqBOB9xsU26vmJO1CVZLksgiuSY,1203
+ dataeval/utils/data/datasets/_voc.py,sha256=_fqcwi7AQXmCkYomLe-4_u6wRAWCkBKYNd3HJGnINsY,13837
+ dataeval/utils/data/selections/__init__.py,sha256=RLjkIh2IAvPktLbUmyLv3p-rvDEaBAdWzjiNnnhVtn8,481
+ dataeval/utils/data/selections/_classfilter.py,sha256=jO_N7AmPMpkMW82Nrk6FU8hcOlxX-0vmmVeUZGU9Lzc,2295
+ dataeval/utils/data/selections/_indices.py,sha256=QdLgXN7GABCvGPYe28PV1RAc_RSP_nZOyCvEpKRBdWg,636
+ dataeval/utils/data/selections/_limit.py,sha256=ECvHRsp7OF4LZw2tE4sGqqJ085kjC-hd2c7QDMfvXr8,518
+ dataeval/utils/data/selections/_reverse.py,sha256=6SWpELC9Wgx-kPqzhDrPNn4NKU6FqDJveLrxV4D2Ypk,374
+ dataeval/utils/data/selections/_shuffle.py,sha256=U2dQPlX5JQhLjpqlk_uztks8G0H_GAl2DOl6ADNJaDY,581
+ dataeval/utils/metadata.py,sha256=X8Hu4LdCzAaE9uk1hI4BflmFve_VOQCqK9lXq0sk9ow,14196
+ dataeval/utils/torch/__init__.py,sha256=dn5mjCrFp0b1aL_UEURhONU0Ag0cmXoTOBSGagpkTiA,325
+ dataeval/utils/torch/_blocks.py,sha256=HVhBTMMD5NA4qheMUgyol1KWiKZDIuc8k5j4RcMKmhk,1466
+ dataeval/utils/torch/_gmm.py,sha256=XBHNLPTtLGRrzq0B4GI48Sha7YHL-0PpXil3s3exLGE,3714
+ dataeval/utils/torch/_internal.py,sha256=5BYibQvvXS-trsHi2x7gjxuaknLwSyj6yWXbOFEdx-M,5790
+ dataeval/utils/torch/models.py,sha256=hmroEs6C6jQ5tAoZa71RFeIvXLxfXrTJSFH_jG2LGQU,9749
+ dataeval/utils/torch/trainer.py,sha256=Qay0LK63RuyoGYiJ5zI2C5BVym309ORvp6shhpcrIU4,5589
+ dataeval/workflows/__init__.py,sha256=ou8y0KO-d6W5lgmcyLjKlf-J_ckP3vilW7wHkgiDlZ4,255
+ dataeval/workflows/sufficiency.py,sha256=mjKmfRrAjShLUFIARv5o8yT5fnFvDsS5Qu6ujIPUgQg,8497
+ dataeval-0.82.0.dist-info/LICENSE.txt,sha256=uAooygKWvX6NbU9Ran9oG2msttoG8aeTeHSTe5JeCnY,1061
+ dataeval-0.82.0.dist-info/METADATA,sha256=pBBKLbnfGy4_THG8YbCZHH4RY2HPbLrDAEWhuL2mCtQ,5304
+ dataeval-0.82.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+ dataeval-0.82.0.dist-info/RECORD,,
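Each entry in the new RECORD file above follows the standard wheel format `path,algorithm=urlsafe-b64-digest,size`, with the RECORD file listing itself without a hash or size. Below is a sketch of verifying those hashes against the wheel's contents; `verify_record` is a hypothetical helper name, built on stdlib calls only.

```python
import base64
import csv
import hashlib
from zipfile import ZipFile


def verify_record(wheel_path: str) -> list[str]:
    """Return the paths whose contents do not match their RECORD hash."""
    mismatches = []
    with ZipFile(wheel_path) as whl:
        record = next(n for n in whl.namelist() if n.endswith(".dist-info/RECORD"))
        for path, hash_spec, _size in csv.reader(whl.read(record).decode().splitlines()):
            if not hash_spec:  # RECORD lists itself without a hash
                continue
            algorithm, _, expected = hash_spec.partition("=")
            digest = hashlib.new(algorithm, whl.read(path)).digest()
            # RECORD encodes digests as urlsafe base64 with '=' padding stripped
            if base64.urlsafe_b64encode(digest).rstrip(b"=").decode() != expected:
                mismatches.append(path)
    return mismatches
```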
dataeval/detectors/linters/clusterer.py
@@ -1,512 +0,0 @@
- from __future__ import annotations
-
- __all__ = []
-
- from dataclasses import dataclass
- from typing import Any, Iterable, NamedTuple, cast
-
- import numpy as np
- from numpy.typing import ArrayLike, NDArray
- from scipy.cluster.hierarchy import linkage
- from scipy.spatial.distance import pdist, squareform
-
- from dataeval.interop import to_numpy
- from dataeval.output import Output, set_metadata
- from dataeval.utils.shared import flatten
-
-
- @dataclass(frozen=True)
- class ClustererOutput(Output):
-     """
-     Output class for :class:`Clusterer` lint detector.
-
-     Attributes
-     ----------
-     outliers : List[int]
-         Indices that do not fall within a cluster
-     potential_outliers : List[int]
-         Indices which are near the border between belonging in the cluster and being an outlier
-     duplicates : List[List[int]]
-         Groups of indices that are exact :term:`duplicates<Duplicates>`
-     potential_duplicates : List[List[int]]
-         Groups of indices which are not exact but closely related data points
-     """
-
-     outliers: list[int]
-     potential_outliers: list[int]
-     duplicates: list[list[int]]
-     potential_duplicates: list[list[int]]
-
-
- def _extend_linkage(link_arr: NDArray) -> NDArray:
-     """
-     Adds a column to the linkage matrix link_arr that tracks the new id assigned
-     to each row
-
-     Parameters
-     ----------
-     link_arr : NDArray
-         linkage matrix
-
-     Returns
-     -------
-     NDArray
-         linkage matrix with adjusted shape, new shape (link_arr.shape[0], link_arr.shape[1]+1)
-     """
-     # Adjusting linkage matrix to accommodate renumbering
-     rows, cols = link_arr.shape
-     arr = np.zeros((rows, cols + 1))
-     arr[:, :-1] = link_arr
-     arr[:, -1] = np.arange(rows + 1, 2 * rows + 1)
-
-     return arr
-
-
- class _Cluster:
-     __slots__ = "merged", "samples", "sample_dist", "is_copy", "count", "dist_avg", "dist_std", "out1", "out2"
-
-     def __init__(self, merged: int, samples: NDArray, sample_dist: float | NDArray, is_copy: bool = False) -> None:
-         self.merged = merged
-         self.samples = np.array(samples, dtype=np.int32)
-         self.sample_dist = np.array([sample_dist] if np.isscalar(sample_dist) else sample_dist)
-         self.is_copy = is_copy
-
-         dist = float(self.sample_dist[-1])
-
-         self.count = len(self.samples)
-         if is_copy:
-             self.dist_avg = 0.0
-             self.dist_std = 0.0
-             self.out1 = False
-             self.out2 = False
-         else:
-             self.dist_avg = float(np.mean(self.sample_dist))
-             self.dist_std = float(np.std(self.sample_dist)) if len(self.sample_dist) > 1 else 1e-5
-             out1 = self.dist_avg + self.dist_std
-             out2 = out1 + self.dist_std
-             self.out1 = dist > out1
-             self.out2 = dist > out2
-
-     def copy(self) -> _Cluster:
-         return _Cluster(False, self.samples, self.sample_dist, True)
-
-     def __repr__(self) -> str:
-         _params = {
-             "merged": self.merged,
-             "samples": self.samples,
-             "sample_dist": self.sample_dist,
-             "is_copy": self.is_copy,
-         }
-         return f"{self.__class__.__name__}(**{repr(_params)})"
-
-
- class _Clusters(dict[int, dict[int, _Cluster]]):
-     def __init__(self, *args: dict[int, dict[int, _Cluster]]) -> None:
-         super().__init__(*args)
-         self.max_level: int = 1
-
-
- class _ClusterPosition(NamedTuple):
-     """Keeps track of a cluster's level and ID"""
-
-     level: int
-     cid: int
-
-
- class _ClusterMergeEntry:
-     __slots__ = "level", "outer_cluster", "inner_cluster", "status"
-
-     def __init__(self, level: int, outer_cluster: int, inner_cluster: int, status: int) -> None:
-         self.level = level
-         self.outer_cluster = outer_cluster
-         self.inner_cluster = inner_cluster
-         self.status = status
-
-     def __lt__(self, value: _ClusterMergeEntry) -> bool:
-         return self.level.__lt__(value.level)
-
-     def __gt__(self, value: _ClusterMergeEntry) -> bool:
-         return self.level.__gt__(value.level)
-
-
- class Clusterer:
-     """
-     Uses hierarchical clustering to flag dataset properties of interest like outliers \
-     and :term:`duplicates<Duplicates>`.
-
-     Parameters
-     ----------
-     dataset : ArrayLike, shape - (N, P)
-         A dataset in an ArrayLike format.
-         Function expects the data to have 2 dimensions, N number of observations in a P-dimensional space.
-
-     Warning
-     -------
-     The Clusterer class is heavily dependent on computational resources, and may fail due to insufficient memory.
-
-     Note
-     ----
-     The Clusterer works best when the length of the feature dimension, P, is less than 500.
-     If flattening a CxHxW image results in a dimension larger than 500, then it is recommended to reduce the dimensions.
-     """
-
-     def __init__(self, dataset: ArrayLike) -> None:
-         # Allows an update to dataset to reset the state rather than instantiate a new class
-         self._on_init(dataset)
-
-     def _on_init(self, dataset: ArrayLike):
-         self._data: NDArray[Any] = flatten(to_numpy(dataset))
-         self._validate_data(self._data)
-         self._num_samples = len(self._data)
-
-         self._darr: NDArray[np.floating[Any]] = pdist(self._data, metric="euclidean")
-         self._sqdmat: NDArray[np.floating[Any]] = squareform(self._darr)
-         self._larr: NDArray[np.floating[Any]] = _extend_linkage(linkage(self._darr))
-         self._max_clusters: int = np.count_nonzero(self._larr[:, 3] == 2)
-
-         min_num = int(self._num_samples * 0.05)
-         self._min_num_samples_per_cluster: int = min(max(2, min_num), 100)
-
-         self._clusters: _Clusters | None = None
-         self._last_good_merge_levels: dict[int, int] | None = None
-
-     @property
-     def data(self) -> NDArray[Any]:
-         return self._data
-
-     @data.setter
-     def data(self, x: ArrayLike) -> None:
-         self._on_init(x)
-
-     @property
-     def clusters(self) -> _Clusters:
-         if self._clusters is None:
-             self._clusters = self._create_clusters()
-         return self._clusters
-
-     @property
-     def last_good_merge_levels(self) -> dict[int, int]:
-         if self._last_good_merge_levels is None:
-             self._last_good_merge_levels = self._get_last_merge_levels()
-         return self._last_good_merge_levels
-
-     @classmethod
-     def _validate_data(cls, x: NDArray):
-         """Checks that the data has the correct size, shape, and format"""
-         if not isinstance(x, np.ndarray):
-             raise TypeError(f"Data should be of type NDArray; got {type(x)}")
-
-         if x.ndim != 2:
-             raise ValueError(
-                 f"Data should only have 2 dimensions; got {x.ndim}. Data should be flattened before being input"
-             )
-         samples, features = x.shape  # Due to above check, we know shape has a length of 2
-         if samples < 2:
-             raise ValueError(f"Data should have at least 2 samples; got {samples}")
-         if features < 1:
-             raise ValueError(f"Samples should have at least 1 feature; got {features}")
-
-     def _create_clusters(self) -> _Clusters:
-         """Generates clusters based on linkage matrix"""
-         next_cluster_id = 0
-         cluster_map: dict[int, _ClusterPosition] = {}  # Dictionary to associate new cluster ids with actual clusters
-         clusters: _Clusters = _Clusters()
-
-         # Walking through the linkage array to generate clusters
-         for arr_i in self._larr:
-             left_id = int(arr_i[0])
-             right_id = int(arr_i[1])
-             sample_dist = np.array([arr_i[2]], dtype=np.float32)
-             merged = False
-
-             # Determine if the id is already associated with a cluster
-             left = cluster_map.get(left_id)
-             right = cluster_map.get(right_id)
-
-             if left and right:
-                 merged = max([left.cid, right.cid])
-                 lc = clusters[left.level][left.cid]
-                 rc = clusters[right.level][right.cid]
-                 left_first = len(lc.samples) >= len(rc.samples)
-                 samples = np.concatenate([lc.samples, rc.samples] if left_first else [rc.samples, lc.samples])
-                 sample_dist = np.concatenate([rc.sample_dist, lc.sample_dist, sample_dist])
-                 level, cid = max(left.level, right.level) + 1, min(left.cid, right.cid)
-
-                 # Only tracking the levels in which clusters merge for the cluster distance matrix
-                 clusters.max_level = max(clusters.max_level, left.level, right.level)
-                 # Update clusters to include previously skipped levels
-                 clusters = self._fill_levels(clusters, left, right)
-             elif left or right:
-                 child, other_id = cast(tuple[_ClusterPosition, int], (left, right_id) if left else (right, left_id))
-                 cc = clusters[child.level][child.cid]
-                 samples = np.concatenate([cc.samples, [other_id]])
-                 sample_dist = np.concatenate([cc.sample_dist, sample_dist])
-                 level, cid = child.level + 1, child.cid
-             else:
-                 samples = np.array([left_id, right_id], dtype=np.int32)
-                 level, cid = 0, next_cluster_id
-                 next_cluster_id += 1
-
-             # Set the cluster and associate the linkage id with the cluster
-             if level not in clusters:
-                 clusters[level] = {}
-
-             clusters[level][cid] = _Cluster(merged, samples, sample_dist)
-             cluster_map[int(arr_i[-1])] = _ClusterPosition(level, cid)
-
-         return clusters
-
-     def _fill_levels(self, clusters: _Clusters, left: _ClusterPosition, right: _ClusterPosition) -> _Clusters:
-         # Sets each level's cluster info if it does not exist
-         if left.level != right.level:
-             (level, cid), max_level = (left, right[0]) if left[0] < right[0] else (right, left[0])
-             cluster = clusters[level][cid].copy()
-             for level_id in range(max_level, level, -1):
-                 clusters[level_id].setdefault(cid, cluster)
-         return clusters
-
-     def _get_cluster_distances(self) -> NDArray:
-         """Calculates the minimum distances between clusters at each level"""
-         # Cluster distance matrix
-         max_level = self.clusters.max_level
-         cluster_matrix = np.full((max_level, self._max_clusters, self._max_clusters), -1.0, dtype=np.float32)
-
-         for level, cluster_set in self.clusters.items():
-             if level < max_level:
-                 cluster_ids = sorted(cluster_set.keys())
-                 for i, cluster_id in enumerate(cluster_ids):
-                     cluster_matrix[level, cluster_id, cluster_id] = self.clusters[level][cluster_id].dist_avg
-                     for int_id in range(i + 1, len(cluster_ids)):
-                         compare_id = cluster_ids[int_id]
-                         sample_a = self.clusters[level][cluster_id].samples
-                         sample_b = self.clusters[level][compare_id].samples
-                         min_mat = self._sqdmat[np.ix_(sample_a, sample_b)].min()
-                         cluster_matrix[level, cluster_id, compare_id] = min_mat
-                         cluster_matrix[level, compare_id, cluster_id] = min_mat
-
-         return cluster_matrix
-
-     def _calc_merge_indices(self, merge_mean: list[NDArray], intra_max: list[float]) -> NDArray:
-         """
-         Determine what clusters should be merged and return their indices
-         """
-         intra_max_uniques = np.unique(intra_max)
-         intra_log_values = np.log(intra_max_uniques)
-         two_std_all = intra_log_values.mean() + 2 * intra_log_values.std()
-         merge_value = np.log(merge_mean)
-         # Mask of indices we know we want to merge
-         desired_merge = merge_value < two_std_all
-
-         # List[Values] for indices we might want to merge
-         check = merge_value[~desired_merge]
-         # Check distance from value to 2 stds of all values
-         check = np.abs((check - two_std_all) / two_std_all)
-         # Mask List[Values < 1]
-         mask = check < 1
-         one_std_check = check[mask].mean() + check[mask].std()
-         # Mask of indices that should also be merged
-         mask2_vals = np.abs((merge_value - two_std_all) / two_std_all)
-         mask2 = mask2_vals < one_std_check
-         return np.logical_or(desired_merge, mask2)
-
-     def _generate_merge_list(self, cluster_matrix: NDArray) -> list[_ClusterMergeEntry]:
-         """
-         Runs through the clusters dictionary determining when clusters merge,
-         and how close those clusters are when they merge.
-
-         Parameters
-         ----------
-         cluster_matrix:
-             The distance matrix for all clusters to all others
-
-         Returns
-         -------
-         List[ClusterMergeEntry]:
-             A list with each cluster's merge history
-         """
-         intra_max = []
-         merge_mean = []
-         merge_list: list[_ClusterMergeEntry] = []
-
-         for level, cluster_set in self.clusters.items():
-             for outer_cluster, cluster in cluster_set.items():
-                 inner_cluster = cluster.merged
-                 if not inner_cluster:
-                     continue
-                 # Extract necessary information
-                 num_samples = len(cluster.samples)
-                 out1 = cluster.out1
-                 out2 = cluster.out2
-
-                 # If outside 2-std or 1-std and larger than a minimum sized cluster, take the mean distance, else max
-                 aggregate_func = (
-                     np.mean if out2 or (out1 and num_samples >= self._min_num_samples_per_cluster) else np.max
-                 )
-
-                 distances = cluster_matrix[:level, outer_cluster, inner_cluster]
-                 intra_distance = cluster_matrix[:, outer_cluster, outer_cluster]
-                 positive_mask = intra_distance >= 0
-                 intra_filtered = intra_distance[positive_mask]
-
-                 # TODO: Append now, take max over axis later?
-                 intra_max.append(np.max(intra_filtered))
-                 # Calculate the corresponding distance stats
-                 distance_stats_arr = aggregate_func(distances)
-                 merge_mean.append(distance_stats_arr)
-                 merge_list.append(_ClusterMergeEntry(level, outer_cluster, inner_cluster, 0))
-
-         all_merge_indices = self._calc_merge_indices(merge_mean=merge_mean, intra_max=intra_max)
-
-         for i, is_mergeable in enumerate(all_merge_indices):
-             merge_list[i].status = is_mergeable
-
-         merge_list = sorted(merge_list, reverse=True)
-
-         return merge_list
-
-     def _get_last_merge_levels(self) -> dict[int, int]:
-         """
-         Creates a dictionary for important cluster ids mapped to their last good merge level
-
-         Returns
-         -------
-         Dict[int, int]
-             A mapping of a cluster id to its last good merge level
-         """
-         last_merge_levels: dict[int, int] = {}
-
-         if self._max_clusters <= 1:
-             last_merge_levels = {0: int(self._num_samples * 0.1)}
-         else:
-             cluster_matrix = self._get_cluster_distances()
-             merge_list = self._generate_merge_list(cluster_matrix)
-             for entry in merge_list:
-                 if not entry.status:
-                     if entry.outer_cluster not in last_merge_levels:
-                         last_merge_levels[entry.outer_cluster] = 0
-                     if entry.inner_cluster not in last_merge_levels:
-                         last_merge_levels[entry.inner_cluster] = 0
-                     if last_merge_levels[entry.outer_cluster] > entry.level:
-                         last_merge_levels[entry.outer_cluster] = entry.level - 1
-                 else:
-                     if entry.outer_cluster in last_merge_levels:
-                         last_merge_levels[entry.outer_cluster] = max(
-                             last_merge_levels[entry.outer_cluster], entry.level
-                         )
-
-         return last_merge_levels
-
-     def find_outliers(self, last_merge_levels: dict[int, int]) -> tuple[list[int], list[int]]:
-         """
-         Retrieves Outliers based on when the sample was added to the cluster
-         and how far it was from the cluster when it was added
-
-         Parameters
-         ----------
-         last_merge_levels : Dict[int, int]
-             A mapping of a cluster id to its last good merge level
-
-         Returns
-         -------
-         Tuple[List[int], List[int]]
-             The outliers and possible outliers as sorted lists of indices
-         """
-         outliers = set()
-         possible_outliers = set()
-         already_seen = set()
-         last_level = {}
-
-         for level, cluster_set in self.clusters.items():
-             for cluster_id, cluster in cluster_set.items():
-                 if cluster_id in last_merge_levels:
-                     last_level[cluster_id] = level
-
-         for level, cluster_set in self.clusters.items():
-             for cluster_id, cluster in cluster_set.items():
-                 if not cluster.merged and cluster_id in last_merge_levels and level > last_merge_levels[cluster_id]:
-                     if cluster_id in already_seen and cluster.samples[-1] not in outliers:
-                         outliers.add(cluster.samples[-1])
-                     elif cluster.out2:
-                         if len(cluster.samples) < self._min_num_samples_per_cluster:
-                             outliers.update(cluster.samples.tolist())
-                         elif cluster.samples[-1] not in outliers:
-                             outliers.add(cluster.samples[-1])
-                         if cluster_id not in already_seen:
-                             already_seen.add(cluster_id)
-                     elif cluster.out1 and len(cluster.samples) >= self._min_num_samples_per_cluster:
-                         possible_outliers.add(cluster.samples[-1])
-                     elif level == last_level[cluster_id] and len(cluster.samples) < self._min_num_samples_per_cluster:
-                         outliers.update(cluster.samples.tolist())
-
-         return sorted(outliers), sorted(possible_outliers)
-
-     def _sorted_union_find(self, index_groups: Iterable[Iterable[int]]) -> list[list[int]]:
-         """Merges and sorts groups of indices that share any common index"""
-         groups: list[list[int]] = []
-         for indices in zip(*index_groups):
-             indices = set(indices)
-             temp = []
-             for group in groups:
-                 if not set(group).isdisjoint(indices):
-                     indices.update(group)
-                 else:
-                     temp.append(group)
-             temp.append(sorted(indices))
-             groups = temp
-         return sorted(groups)
-
-     def find_duplicates(self, last_merge_levels: dict[int, int]) -> tuple[list[list[int]], list[list[int]]]:
-         """
-         Finds duplicate and near duplicate data based on the last good merge levels when building the cluster
-
-         Parameters
-         ----------
-         last_merge_levels : Dict[int, int]
-             A mapping of a cluster id to its last good merge level
-
-         Returns
-         -------
-         Tuple[List[List[int]], List[List[int]]]
-             The exact :term:`duplicates<Duplicates>` and near duplicates as lists of related indices
-         """
-
-         duplicates_std = []
-         for cluster_id, level in last_merge_levels.items():
-             samples = self.clusters[level][cluster_id].samples
-             if len(samples) >= self._min_num_samples_per_cluster:
-                 duplicates_std.append(self.clusters[level][cluster_id].dist_std)
-         diag_mask = np.ones_like(self._sqdmat, dtype=np.bool_)
-         np.fill_diagonal(diag_mask, 0)
-         diag_mask = np.triu(diag_mask)
-
-         exact_mask = self._sqdmat <= (np.mean(duplicates_std) / 100)
-         exact_indices = np.nonzero(exact_mask & diag_mask)
-         exact_dupes = self._sorted_union_find(exact_indices)
-
-         near_mask = self._sqdmat <= np.mean(duplicates_std)
-         near_indices = np.nonzero(near_mask & diag_mask & ~exact_mask)
-         near_dupes = self._sorted_union_find(near_indices)
-
-         return exact_dupes, near_dupes
-
-     # TODO: Move data input to evaluate from class
-     @set_metadata(state=["data"])
-     def evaluate(self) -> ClustererOutput:
-         """Finds and flags indices of the data for Outliers and :term:`duplicates<Duplicates>`
-
-         Returns
-         -------
-         ClustererOutput
-             The Outliers and duplicate indices found in the data
-
-         Example
-         -------
-         >>> cluster = Clusterer(clusterer_images)
-         >>> cluster.evaluate()
-         ClustererOutput(outliers=[18, 21, 34, 35, 45], potential_outliers=[13, 15, 42], duplicates=[[9, 24], [23, 48]], potential_duplicates=[[1, 11]])
-         """  # noqa: E501
-
-         outliers, potential_outliers = self.find_outliers(self.last_good_merge_levels)
-         duplicates, potential_duplicates = self.find_duplicates(self.last_good_merge_levels)
-
-         return ClustererOutput(outliers, potential_outliers, duplicates, potential_duplicates)
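The removed `Clusterer` walked scipy's linkage matrix, and its `_extend_linkage` helper (above) simply made scipy's implicit cluster numbering explicit as an extra column. A standalone sketch of that convention, with illustrative data:

```python
import numpy as np
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import pdist

rng = np.random.default_rng(0)
data = rng.normal(size=(10, 3))  # 10 samples in a 3-dimensional space

# Row i of scipy's linkage matrix merges the two clusters in columns 0-1 at
# the distance in column 2; the merged cluster implicitly receives id n + i.
larr = linkage(pdist(data))
n = data.shape[0]
new_ids = np.arange(n, n + larr.shape[0])  # ids n .. 2n-2, one per merge row

# _extend_linkage appended these ids as a fifth column so later passes could
# look up which cluster each linkage row produced.
extended = np.column_stack([larr, new_ids])
assert extended.shape == (n - 1, 5)
```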
dataeval/detectors/linters/merged_stats.py
@@ -1,49 +0,0 @@
- from __future__ import annotations
-
- __all__ = []
-
- from copy import deepcopy
- from typing import Sequence, TypeVar
-
- import numpy as np
-
- from dataeval.metrics.stats.base import BaseStatsOutput
-
- TStatsOutput = TypeVar("TStatsOutput", bound=BaseStatsOutput)
-
-
- def add_stats(a: TStatsOutput, b: TStatsOutput) -> TStatsOutput:
-     if type(a) is not type(b):
-         raise TypeError(f"Types {type(a)} and {type(b)} cannot be added.")
-
-     sum_dict = deepcopy(a.dict())
-
-     for k in sum_dict:
-         if isinstance(sum_dict[k], list):
-             sum_dict[k].extend(b.dict()[k])
-         else:
-             sum_dict[k] = np.concatenate((sum_dict[k], b.dict()[k]))
-
-     return type(a)(**sum_dict)
-
-
- def combine_stats(stats: Sequence[TStatsOutput]) -> tuple[TStatsOutput, list[int]]:
-     output = None
-     dataset_steps = []
-     cur_len = 0
-     for s in stats:
-         output = s if output is None else add_stats(output, s)
-         cur_len += len(s)
-         dataset_steps.append(cur_len)
-     if output is None:
-         raise TypeError("Cannot combine empty sequence of stats.")
-     return output, dataset_steps
-
-
- def get_dataset_step_from_idx(idx: int, dataset_steps: list[int]) -> tuple[int, int]:
-     last_step = 0
-     for i, step in enumerate(dataset_steps):
-         if idx < step:
-             return i, idx - last_step
-         last_step = step
-     return -1, idx
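For context, the removed `combine_stats` returns cumulative lengths in `dataset_steps`, and `get_dataset_step_from_idx` maps a flattened index back to a `(dataset, local index)` pair. A worked sketch of that bookkeeping follows; `locate` is a hypothetical equivalent built on `bisect`, not part of the package.

```python
from bisect import bisect_right


def locate(idx: int, dataset_steps: list[int]) -> tuple[int, int]:
    """Map a flattened index to (dataset index, index within that dataset)."""
    i = bisect_right(dataset_steps, idx)
    if i == len(dataset_steps):
        return -1, idx  # past the last step: same sentinel as the removed helper
    return i, idx - (dataset_steps[i - 1] if i else 0)


# Three stats outputs of lengths 4, 2, and 3 yield dataset_steps == [4, 6, 9].
steps = [4, 6, 9]
assert locate(0, steps) == (0, 0)   # first item of the first dataset
assert locate(5, steps) == (1, 1)   # second item of the second dataset
assert locate(9, steps) == (-1, 9)  # out-of-range sentinel
```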