dataeval 0.81.0__py3-none-any.whl → 0.82.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. dataeval/__init__.py +1 -1
  2. dataeval/config.py +68 -11
  3. dataeval/detectors/drift/__init__.py +2 -2
  4. dataeval/detectors/drift/_base.py +8 -64
  5. dataeval/detectors/drift/_mmd.py +12 -38
  6. dataeval/detectors/drift/_torch.py +7 -7
  7. dataeval/detectors/drift/_uncertainty.py +6 -5
  8. dataeval/detectors/drift/updates.py +20 -3
  9. dataeval/detectors/linters/__init__.py +3 -2
  10. dataeval/detectors/linters/duplicates.py +14 -46
  11. dataeval/detectors/linters/outliers.py +25 -159
  12. dataeval/detectors/ood/__init__.py +1 -1
  13. dataeval/detectors/ood/ae.py +6 -5
  14. dataeval/detectors/ood/base.py +2 -2
  15. dataeval/detectors/ood/metadata_ood_mi.py +4 -6
  16. dataeval/detectors/ood/mixin.py +3 -4
  17. dataeval/detectors/ood/vae.py +3 -2
  18. dataeval/metadata/__init__.py +2 -1
  19. dataeval/metadata/_distance.py +134 -0
  20. dataeval/metadata/_ood.py +30 -49
  21. dataeval/metadata/_utils.py +44 -0
  22. dataeval/metrics/bias/__init__.py +5 -4
  23. dataeval/metrics/bias/_balance.py +17 -149
  24. dataeval/metrics/bias/_coverage.py +4 -106
  25. dataeval/metrics/bias/_diversity.py +12 -107
  26. dataeval/metrics/bias/_parity.py +7 -71
  27. dataeval/metrics/estimators/__init__.py +5 -4
  28. dataeval/metrics/estimators/_ber.py +2 -20
  29. dataeval/metrics/estimators/_clusterer.py +1 -61
  30. dataeval/metrics/estimators/_divergence.py +2 -19
  31. dataeval/metrics/estimators/_uap.py +2 -16
  32. dataeval/metrics/stats/__init__.py +15 -12
  33. dataeval/metrics/stats/_base.py +41 -128
  34. dataeval/metrics/stats/_boxratiostats.py +13 -13
  35. dataeval/metrics/stats/_dimensionstats.py +17 -58
  36. dataeval/metrics/stats/_hashstats.py +19 -35
  37. dataeval/metrics/stats/_imagestats.py +94 -0
  38. dataeval/metrics/stats/_labelstats.py +42 -121
  39. dataeval/metrics/stats/_pixelstats.py +19 -51
  40. dataeval/metrics/stats/_visualstats.py +19 -51
  41. dataeval/outputs/__init__.py +57 -0
  42. dataeval/outputs/_base.py +182 -0
  43. dataeval/outputs/_bias.py +381 -0
  44. dataeval/outputs/_drift.py +83 -0
  45. dataeval/outputs/_estimators.py +114 -0
  46. dataeval/outputs/_linters.py +186 -0
  47. dataeval/outputs/_metadata.py +54 -0
  48. dataeval/{detectors/ood/output.py → outputs/_ood.py} +22 -22
  49. dataeval/outputs/_stats.py +393 -0
  50. dataeval/outputs/_utils.py +44 -0
  51. dataeval/outputs/_workflows.py +364 -0
  52. dataeval/typing.py +187 -7
  53. dataeval/utils/_method.py +1 -5
  54. dataeval/utils/_plot.py +2 -2
  55. dataeval/utils/data/__init__.py +5 -1
  56. dataeval/utils/data/_dataset.py +217 -0
  57. dataeval/utils/data/_embeddings.py +12 -14
  58. dataeval/utils/data/_images.py +30 -27
  59. dataeval/utils/data/_metadata.py +28 -11
  60. dataeval/utils/data/_selection.py +25 -22
  61. dataeval/utils/data/_split.py +5 -29
  62. dataeval/utils/data/_targets.py +14 -2
  63. dataeval/utils/data/datasets/_base.py +5 -5
  64. dataeval/utils/data/datasets/_cifar10.py +1 -1
  65. dataeval/utils/data/datasets/_milco.py +1 -1
  66. dataeval/utils/data/datasets/_mnist.py +1 -1
  67. dataeval/utils/data/datasets/_ships.py +1 -1
  68. dataeval/utils/data/{_types.py → datasets/_types.py} +10 -16
  69. dataeval/utils/data/datasets/_voc.py +1 -1
  70. dataeval/utils/data/selections/_classfilter.py +4 -5
  71. dataeval/utils/data/selections/_indices.py +2 -2
  72. dataeval/utils/data/selections/_limit.py +2 -2
  73. dataeval/utils/data/selections/_reverse.py +2 -2
  74. dataeval/utils/data/selections/_shuffle.py +2 -2
  75. dataeval/utils/torch/_internal.py +5 -5
  76. dataeval/utils/torch/trainer.py +8 -8
  77. dataeval/workflows/__init__.py +2 -1
  78. dataeval/workflows/sufficiency.py +6 -342
  79. {dataeval-0.81.0.dist-info → dataeval-0.82.1.dist-info}/METADATA +2 -2
  80. dataeval-0.82.1.dist-info/RECORD +105 -0
  81. dataeval/_output.py +0 -137
  82. dataeval/detectors/ood/metadata_ks_compare.py +0 -129
  83. dataeval/metrics/stats/_datasetstats.py +0 -198
  84. dataeval-0.81.0.dist-info/RECORD +0 -94
  85. {dataeval-0.81.0.dist-info → dataeval-0.82.1.dist-info}/LICENSE.txt +0 -0
  86. {dataeval-0.81.0.dist-info → dataeval-0.82.1.dist-info}/WHEEL +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dataeval
3
- Version: 0.81.0
3
+ Version: 0.82.1
4
4
  Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
5
5
  Home-page: https://dataeval.ai/
6
6
  License: MIT
@@ -74,7 +74,7 @@ DataEval is easy to install, supports a wide range of Python versions, and is
74
74
  compatible with many of the most popular packages in the scientific and T&E
75
75
  communities.
76
76
 
77
- DataEval also has native interopability between JATIC's suite of tools when
77
+ DataEval also has native interoperability between JATIC's suite of tools when
78
78
  using MAITE-compliant datasets and models.
79
79
  <!-- end JATIC interop -->
80
80
 
@@ -0,0 +1,105 @@
1
+ dataeval/__init__.py,sha256=5VDv2s5EVQIekwUGMu3Bj2p38NdJToNibRA5do5sjwQ,1510
2
+ dataeval/_log.py,sha256=Mn5bRWO0cgtAYd5VGYSFiPgu57ta3zoktrtHAZ1m3dU,357
3
+ dataeval/config.py,sha256=lAOPOPwXHsMXTmZPpCXvWJ1xBg9HY8e0dRsa5MsIKws,3459
4
+ dataeval/detectors/__init__.py,sha256=3Sg-XWlwr75zEEH3hZKA4nWMtGvaRlnfzTWvZG_Ak6U,189
5
+ dataeval/detectors/drift/__init__.py,sha256=6is_XBtG1d-vUbhHvqXGOdnAwxJ7NA5yRfURn7pCeIw,651
6
+ dataeval/detectors/drift/_base.py,sha256=mJdKvyROgWvz-p1VlAIJqUI6BAj9ss8riUvR5An5wIw,13459
7
+ dataeval/detectors/drift/_cvm.py,sha256=H2w-I0eMD7yP-CSmpdodeJ0-TYznJT7w_H7JuobESow,3859
8
+ dataeval/detectors/drift/_ks.py,sha256=-5k3RBPA3kadX7oD14Wc52rAqQf1udwFeW7Qf3Sv4Tw,4058
9
+ dataeval/detectors/drift/_mmd.py,sha256=NEXowx9UHIvmEKS8sqssw6PMLJMh0BZPhRNX1hYlkz4,7239
10
+ dataeval/detectors/drift/_torch.py,sha256=VrFCyTaRrUslFPy_mYZ4UL70LZ8faH4eHwLurZ9qqNE,7628
11
+ dataeval/detectors/drift/_uncertainty.py,sha256=O5h6_bJbeQEE660SLLP8k-EHqImmKegIcxzcnUKI7X4,5714
12
+ dataeval/detectors/drift/updates.py,sha256=Btu2iaZW7fbO59G1w5v3ykFot0YPzy2U6VjF0d440VE,2195
13
+ dataeval/detectors/linters/__init__.py,sha256=xn2zPwUcmsuf-Jd9uw6AVI11C9z1b1Y9fYtuFnXenZ0,404
14
+ dataeval/detectors/linters/duplicates.py,sha256=tcxniL8rRZkDdQqfuS502UmfKxS3a7iRA22Dtt_vQIk,4935
15
+ dataeval/detectors/linters/outliers.py,sha256=Hln2dPQZjF_uV2QYptA_o6ZF3ugyCImVT-XLDB2-q3A,9042
16
+ dataeval/detectors/ood/__init__.py,sha256=juCYBDs7CQEAtMhnEpPqF6uTrOIH9kTBSuQ_GRw6a8o,283
17
+ dataeval/detectors/ood/ae.py,sha256=YQfhB1ShQLjM1V4uCz9Oo2tCZpOfAZ_-SBCAl4Ac67Y,2921
18
+ dataeval/detectors/ood/base.py,sha256=9b-Ljznf0lB1SXF4F_Aj3eJ4Y3ijGEDPMjucUsWOGJM,3051
19
+ dataeval/detectors/ood/metadata_ood_mi.py,sha256=aMSP3zh5EwIWqM7w135ZAuTVnpqYI4dN3tEOrx41lsk,3837
20
+ dataeval/detectors/ood/mixin.py,sha256=0_o-1HPvgf3-Lf1MSOIfjj5UB8LTLEBGYtJJfyCCzwc,5431
21
+ dataeval/detectors/ood/vae.py,sha256=Fcq0-WbLhzYCgYOAJPBklHm7yuXmFJuEpBkhgwM5kiA,2291
22
+ dataeval/metadata/__init__.py,sha256=B5Ix4T75UPEqY0rofaJlbRf8zCqx8yWLdE3Jo9cALHc,262
23
+ dataeval/metadata/_distance.py,sha256=xsXMMg1pJkHcEZ-KIlqv9YOGYVID3ELjt3-fr1QVnOs,4082
24
+ dataeval/metadata/_ood.py,sha256=DsFnxWgaa6-1FJDg_Q2ZDCeTzhfXpX8YyLvlRGwtJpw,8236
25
+ dataeval/metadata/_utils.py,sha256=r8qBJT83RblobD5W5zyTVi6vYi51Dwkqswizdbzss-M,1169
26
+ dataeval/metrics/__init__.py,sha256=8VC8q3HuJN3o_WN51Ae2_wXznl3RMXIvA5GYVcy7vr8,225
27
+ dataeval/metrics/bias/__init__.py,sha256=1yTLmgiu1kwT_7ZWcjOUbj8R0NJ0DjGoCuWdA0_T7kc,683
28
+ dataeval/metrics/bias/_balance.py,sha256=yADHFrdxW6-WieXIeINqO9cy5vhSIq4tx3Q9Aa1vnTo,6143
29
+ dataeval/metrics/bias/_coverage.py,sha256=PeUoOiaghUEdn6Ov8z2-am7-fnBVIPcFbJK7Ty5JObA,3647
30
+ dataeval/metrics/bias/_diversity.py,sha256=U_l4oYjH39rON2Io0BdCIwJxxob0cKTW8bZNufG0CWs,5820
31
+ dataeval/metrics/bias/_parity.py,sha256=8JRZv4wLpxN9zTvMDlcpKgz-2nO-9eVjqccODcf2nbw,11535
32
+ dataeval/metrics/estimators/__init__.py,sha256=Pnds8uIyAovt2fKqZjiHCIP_kVoBWlVllekYuK5UmmU,568
33
+ dataeval/metrics/estimators/_ber.py,sha256=TrZNO1frRldUDICLzaQGt9wuMiqmvsUFdkZ3cIVv9W4,5344
34
+ dataeval/metrics/estimators/_clusterer.py,sha256=1HrpihGTJ63IkNSOy4Ibw633Gllkm1RxKmoKT5MOgt0,1434
35
+ dataeval/metrics/estimators/_divergence.py,sha256=QDWl1lyAYoO9D3Ho7qOHSk6ud8Gi2MGuXEsYwO1HxvA,4043
36
+ dataeval/metrics/estimators/_uap.py,sha256=BULEBbJ9BQ1IcTeZf0x7iI60QHAWCccBOM97FIu9VXA,1928
37
+ dataeval/metrics/stats/__init__.py,sha256=6tA_9nbbM5ObJ6cds8Y1VBtTQiTOxrpGQSFLu_lWGGA,1098
38
+ dataeval/metrics/stats/_base.py,sha256=5x27-zZMIJR3HcMe_COIx1YOjlVnYK5GaTED6fifRP0,10838
39
+ dataeval/metrics/stats/_boxratiostats.py,sha256=8Kd2FTZ5PLNYZfdAjU_R385gb0Z16JY0L9H_d5ZhgQs,6341
40
+ dataeval/metrics/stats/_dimensionstats.py,sha256=73mFP-Myxne0peFliwvTntc0kk4cpq0krzMvSLDSIMM,2702
41
+ dataeval/metrics/stats/_hashstats.py,sha256=gp9X_pnTT3mPH9YNrWLdn2LQPK_epJ3dQRoyOCwmKlg,4758
42
+ dataeval/metrics/stats/_imagestats.py,sha256=gUPNgN5Zwzdr7WnSwbve1NXNsyxd5dy3cSnlR_7guCg,3007
43
+ dataeval/metrics/stats/_labelstats.py,sha256=PtGyqj4RHw0cyLAWAR9FzZGqgA81AtxLGHZiuMAL2h0,4100
44
+ dataeval/metrics/stats/_pixelstats.py,sha256=SfergRbjNJE4h0xqe-0c8RnKtZmEkZ9MwExdipLSGvg,3247
45
+ dataeval/metrics/stats/_visualstats.py,sha256=cq4AbF2B50Ihbzb86FphcnKQ1TSwNnP3PsnbpiPQZWw,3698
46
+ dataeval/outputs/__init__.py,sha256=FLtrqwjRAT6qeU-BnosWVqvIpOn6dOys18D-fJiYxKw,1609
47
+ dataeval/outputs/_base.py,sha256=aZFbgybnZSQ3ws7QYRLTbDFqUfBFRVtIwX2LZfeGFUA,5703
48
+ dataeval/outputs/_bias.py,sha256=O5RHbTUJDwkwJfz2-YoOfRb4eDl5Tg1UFVtvs025wfA,12173
49
+ dataeval/outputs/_drift.py,sha256=gOiu2C-ERTWiRqlP0auMYxPBGdm9HecWPqWfg7I4tZg,2015
50
+ dataeval/outputs/_estimators.py,sha256=a2oAIxxEDZ9WLGfMWH8KD-BVUS_SnULRPR-iI9hFPoQ,3047
51
+ dataeval/outputs/_linters.py,sha256=YOdjrfm8ypdRrqYOaPM9nc6wVJI3-ita3Haj7LHDNaw,6416
52
+ dataeval/outputs/_metadata.py,sha256=VVcGbtx9DtHQdqgab-fzuVqAhnctZr8WX93RxVNEg5Y,1502
53
+ dataeval/outputs/_ood.py,sha256=suLKVXULGtXH0rq9eXHI1d3d2jhGmItJtz4QiQd47A4,1718
54
+ dataeval/outputs/_stats.py,sha256=PhRdyWWZxewzenFx0MxK9y9ZLE2MnMA-a4-JeSJ_Bs8,13180
55
+ dataeval/outputs/_utils.py,sha256=HHlGC7sk416m_3Bgn075Qdblz_aPup_UOafJpB0RuXY,893
56
+ dataeval/outputs/_workflows.py,sha256=MkRD6ubI4NCBXb9v3kjXy64cUGs3G-JKkBdOpRD9XVE,10750
57
+ dataeval/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
58
+ dataeval/typing.py,sha256=YNSCZ6V39JaPoXwOEexIOIejTjoCQTE9UA_DbHyW34o,5824
59
+ dataeval/utils/__init__.py,sha256=T8F8zJh4ZAeu0wDzfpld92I2zJg9mWBmkGCHrDPU7gk,264
60
+ dataeval/utils/_array.py,sha256=fc04sYShIdsRS4qtG1UCnlGGk-yVRxlOHTNAmW7NpDY,4990
61
+ dataeval/utils/_bin.py,sha256=nylthmsC3vzLHLhlUMACvZs--h7xvAh9Pt75InaQJW8,7322
62
+ dataeval/utils/_clusterer.py,sha256=fw5x-2QN0TIbiodDKHZxRgxKHINedpPcOklzce0Rbjg,5436
63
+ dataeval/utils/_fast_mst.py,sha256=4_7ykVihCL5jWtxcGnrecIsDQo65kUml9SZ1JxgBZYY,7172
64
+ dataeval/utils/_image.py,sha256=capzF_X5H0jy0PmTP3Hf52GFgLqrnfU6gS4tiwck9jo,1939
65
+ dataeval/utils/_method.py,sha256=9B9JQbgqWJBRhQJb7glajUtWaQzUTIUuvrZ9_bisxsM,394
66
+ dataeval/utils/_mst.py,sha256=gXjUUhz9G4wkcCUTqQ-61Ti9sZUFx08hEjlZXWiEmPc,2163
67
+ dataeval/utils/_plot.py,sha256=mTRQNbJsA42QMiOwZbJaH8sNYgP996QFDEGVVE9HSgY,7076
68
+ dataeval/utils/data/__init__.py,sha256=vldQ2ZXl8gnI3s4vAGqUUVi6dc_R58F3JMSpbCOyFRI,820
69
+ dataeval/utils/data/_dataset.py,sha256=tjZUJnxj9IY71GKqdKltrwufkn0EC0S3a6ylrW5Bc2s,7756
70
+ dataeval/utils/data/_embeddings.py,sha256=K33F-swjOOxrII2Oq1tUZHx_BGJtQjoSH23deKy_kqI,3528
71
+ dataeval/utils/data/_images.py,sha256=pv_vvpH8hWxPgLvjeVC2mZiyZivZFNLARNIOXam5ceY,1984
72
+ dataeval/utils/data/_metadata.py,sha256=VqeePp7NtoFFWzmIhH4fn-cjrnATpgzgzs-d73cnBXM,14370
73
+ dataeval/utils/data/_selection.py,sha256=etnEn1QsBlcmtAltjbrCXh6YwHRHKzFPp84fj6MJeGs,4058
74
+ dataeval/utils/data/_split.py,sha256=YdsqTRjKbdSfg8w0f4XgX7j0uOSdtfzvvyObAzyqgI0,18433
75
+ dataeval/utils/data/_targets.py,sha256=ws5d9wRiDkIuOV7GSAKNxzgSm6AWTgb0BFroQK5nAmM,3057
76
+ dataeval/utils/data/collate.py,sha256=Z5nmBnWV_IoJzMp_tj8RCKjMJA9sSCY_zZITqISGixc,3865
77
+ dataeval/utils/data/datasets/__init__.py,sha256=jBrswiERrvBx4pJQJZIq_B5UE-Wy8a2_SBfM2crG8R8,511
78
+ dataeval/utils/data/datasets/_base.py,sha256=lvC13xCy6DDlDBSu5PSdU73ySyeBde2Q91NxrMtos_s,8732
79
+ dataeval/utils/data/datasets/_cifar10.py,sha256=tDALqZKKcUbG6dHZm4MXQMhUCS1W_U-aC9aOu4fjxcM,5160
80
+ dataeval/utils/data/datasets/_fileio.py,sha256=SixIk5nIlIwJdX9zjNXS10vHA3hL8aaYbqHsDg1xSpY,6447
81
+ dataeval/utils/data/datasets/_milco.py,sha256=8FLMNn1qI92zGlsB3sAqDTATdK807wqQNCCiSL5lBOM,6068
82
+ dataeval/utils/data/datasets/_mixin.py,sha256=FJgZP_cpJkgAHA3j3ai_j3Wt7aFSEjIMVmt9NpvVXzg,1757
83
+ dataeval/utils/data/datasets/_mnist.py,sha256=PmEEyhJ50Wo4utCYYDJUSgY69FCvXgAhndSCTjfrWN0,7223
84
+ dataeval/utils/data/datasets/_ships.py,sha256=b521rr8OYuBBTHerBtIdYRTGnEs-E8AOZ2hsa2BvkN4,4361
85
+ dataeval/utils/data/datasets/_types.py,sha256=OOxgMjX3QnB-M2O_NqBOB9xsU26vmJO1CVZLksgiuSY,1203
86
+ dataeval/utils/data/datasets/_voc.py,sha256=_fqcwi7AQXmCkYomLe-4_u6wRAWCkBKYNd3HJGnINsY,13837
87
+ dataeval/utils/data/selections/__init__.py,sha256=RLjkIh2IAvPktLbUmyLv3p-rvDEaBAdWzjiNnnhVtn8,481
88
+ dataeval/utils/data/selections/_classfilter.py,sha256=hg6QTpUO4XqjQCwSkbFHzg9PWnaZAKcrd-HjQZqTUU0,2420
89
+ dataeval/utils/data/selections/_indices.py,sha256=QdLgXN7GABCvGPYe28PV1RAc_RSP_nZOyCvEpKRBdWg,636
90
+ dataeval/utils/data/selections/_limit.py,sha256=ECvHRsp7OF4LZw2tE4sGqqJ085kjC-hd2c7QDMfvXr8,518
91
+ dataeval/utils/data/selections/_reverse.py,sha256=6SWpELC9Wgx-kPqzhDrPNn4NKU6FqDJveLrxV4D2Ypk,374
92
+ dataeval/utils/data/selections/_shuffle.py,sha256=U2dQPlX5JQhLjpqlk_uztks8G0H_GAl2DOl6ADNJaDY,581
93
+ dataeval/utils/metadata.py,sha256=X8Hu4LdCzAaE9uk1hI4BflmFve_VOQCqK9lXq0sk9ow,14196
94
+ dataeval/utils/torch/__init__.py,sha256=dn5mjCrFp0b1aL_UEURhONU0Ag0cmXoTOBSGagpkTiA,325
95
+ dataeval/utils/torch/_blocks.py,sha256=HVhBTMMD5NA4qheMUgyol1KWiKZDIuc8k5j4RcMKmhk,1466
96
+ dataeval/utils/torch/_gmm.py,sha256=XBHNLPTtLGRrzq0B4GI48Sha7YHL-0PpXil3s3exLGE,3714
97
+ dataeval/utils/torch/_internal.py,sha256=23DCnF7C7N3tZgZUpT2nyH7mMb8Pi4GcnQyjK0BKHpg,5735
98
+ dataeval/utils/torch/models.py,sha256=hmroEs6C6jQ5tAoZa71RFeIvXLxfXrTJSFH_jG2LGQU,9749
99
+ dataeval/utils/torch/trainer.py,sha256=iUotX4OdirH8-ZtjdpU8gbJavkYW9YY9qpA2mAlFy1Y,5520
100
+ dataeval/workflows/__init__.py,sha256=ou8y0KO-d6W5lgmcyLjKlf-J_ckP3vilW7wHkgiDlZ4,255
101
+ dataeval/workflows/sufficiency.py,sha256=mjKmfRrAjShLUFIARv5o8yT5fnFvDsS5Qu6ujIPUgQg,8497
102
+ dataeval-0.82.1.dist-info/LICENSE.txt,sha256=uAooygKWvX6NbU9Ran9oG2msttoG8aeTeHSTe5JeCnY,1061
103
+ dataeval-0.82.1.dist-info/METADATA,sha256=3HhWy9SOUtb9cCENZfd_DOpBEsVp8BZvf5zuK7r0kEM,5304
104
+ dataeval-0.82.1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
105
+ dataeval-0.82.1.dist-info/RECORD,,
dataeval/_output.py DELETED
@@ -1,137 +0,0 @@
1
- from __future__ import annotations
2
-
3
- __all__ = []
4
-
5
- import inspect
6
- import logging
7
- import sys
8
- from collections.abc import Mapping
9
- from datetime import datetime, timezone
10
- from functools import partial, wraps
11
- from typing import Any, Callable, Iterator, TypeVar
12
-
13
- import numpy as np
14
-
15
- if sys.version_info >= (3, 10):
16
- from typing import ParamSpec
17
- else:
18
- from typing_extensions import ParamSpec
19
-
20
- from dataeval import __version__
21
-
22
-
23
class Output:
    """
    Base class for result objects.

    Attributes without a leading underscore are treated as results and are
    reported by :meth:`dict`; attributes with a leading underscore are treated
    as execution metadata and are reported by :meth:`meta`.
    """

    # Metadata attribute declarations (annotations only; no values are assigned here).
    _name: str
    _execution_time: datetime
    _execution_duration: float
    _arguments: dict[str, str]
    _state: dict[str, str]
    _version: str

    def __str__(self) -> str:
        return f"{type(self).__name__}: {self.dict()!s}"

    def dict(self) -> dict[str, Any]:
        """
        Output attributes as a dictionary.

        Returns
        -------
        dict[str, Any]
            Every instance attribute whose name does not start with an underscore.
        """
        public = {}
        for key, value in vars(self).items():
            if key.startswith("_"):
                continue
            public[key] = value
        return public

    def meta(self) -> dict[str, Any]:
        """
        Execution metadata as a dictionary.

        Returns
        -------
        dict[str, Any]
            Every instance attribute whose name starts with an underscore,
            keyed by the name with one leading underscore removed.
        """
        private = {}
        for key, value in vars(self).items():
            if key.startswith("_"):
                private[key.removeprefix("_")] = value
        return private
53
-
54
-
55
# Key type is constrained to str, int, float or set; the value type is unconstrained.
TKey = TypeVar("TKey", str, int, float, set)
TValue = TypeVar("TValue")


class MappingOutput(Mapping[TKey, TValue], Output):
    """
    Read-only mapping wrapper around result data that is also an ``Output``.

    Item access, iteration and length are delegated to the wrapped mapping.
    """

    # The wrapped mapping lives in a slot, so it is not part of the instance
    # ``__dict__`` that ``Output.dict()`` and ``Output.meta()`` inspect.
    __slots__ = ["_data"]

    def __init__(self, data: Mapping[TKey, TValue]):
        self._data = data

    def __getitem__(self, key: TKey) -> TValue:
        return self._data[key]

    def __iter__(self) -> Iterator[TKey]:
        return iter(self._data)

    def __len__(self) -> int:
        return len(self._data)

    def dict(self) -> dict[str, TValue]:
        """Return the wrapped data as a plain dictionary with every key converted to ``str``."""
        stringified = {}
        for key, value in self._data.items():
            stringified[str(key)] = value
        return stringified
76
-
77
-
78
P = ParamSpec("P")
R = TypeVar("R", bound=Output)


def set_metadata(fn: Callable[P, R] | None = None, *, state: list[str] | None = None) -> Callable[P, R]:
    """
    Decorator to stamp Output classes with runtime metadata.

    Can be applied bare (``@set_metadata``) or with options
    (``@set_metadata(state=[...])``).

    Parameters
    ----------
    fn : Callable or None
        Function or method to wrap. When None, a partially applied decorator
        carrying ``state`` is returned instead.
    state : list[str] or None
        Names of attributes to read from the first positional argument and
        record under ``_state``. Only used when the wrapped callable has a
        parameter named ``self``.

    Returns
    -------
    Callable
        Wrapper that calls ``fn``, logs the start and end of the call at INFO
        level, and sets ``_name``, ``_execution_time``, ``_execution_duration``,
        ``_arguments``, ``_state`` and ``_version`` on the returned object.
    """
    # Local import: the module-level namespace is not touched by this block.
    from time import perf_counter

    if fn is None:
        return partial(set_metadata, state=state)  # type: ignore

    # The signature cannot change between calls, so inspect it once at decoration time.
    fn_params = inspect.signature(fn).parameters
    param_defaults = {k: None if v.default is inspect.Parameter.empty else v.default for k, v in fn_params.items()}

    def fmt(v):
        """Summarize a value: scalars are kept as-is, everything else becomes a short description."""
        if np.isscalar(v):
            return v
        if hasattr(v, "shape"):
            return f"{v.__class__.__name__}: shape={getattr(v, 'shape')}"
        if hasattr(v, "__len__"):
            return f"{v.__class__.__name__}: len={len(v)}"
        return f"{v.__class__.__name__}"

    @wraps(fn)
    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
        # Collect function metadata
        # start from all params with defaults, then overlay positional and explicit keyword args
        arguments = dict(param_defaults)
        arguments.update(zip(fn_params, args))
        arguments.update(kwargs)
        arguments = {k: fmt(v) for k, v in arguments.items()}
        is_method = "self" in arguments
        state_attrs = {k: fmt(getattr(args[0], k)) for k in state or []} if is_method else {}
        module = args[0].__class__.__module__ if is_method else fn.__module__.removeprefix("src.")
        class_prefix = f".{args[0].__class__.__name__}." if is_method else "."
        name = f"{module}{class_prefix}{fn.__name__}"
        arguments = {k: v for k, v in arguments.items() if k != "self"}

        _logger = logging.getLogger(module)
        started_at = datetime.now(timezone.utc)
        # Lazy %-style arguments: the message is only rendered if INFO is enabled.
        _logger.info(">>> Executing '%s': args=%s state=%s <<<", name, arguments, state)

        ##### EXECUTE FUNCTION #####
        # Time with a monotonic clock so wall-clock adjustments cannot skew the duration.
        tick = perf_counter()
        result = fn(*args, **kwargs)
        duration = perf_counter() - tick
        ############################

        _logger.info(
            ">>> Completed '%s': args=%s state=%s duration=%s <<<", name, arguments, state, duration
        )

        # Update output with recorded metadata
        metadata = {
            "_name": name,
            "_execution_time": started_at,
            "_execution_duration": duration,
            "_arguments": arguments,
            "_state": state_attrs,
            "_version": __version__,
        }
        for k, v in metadata.items():
            # object.__setattr__ bypasses any __setattr__ override on the result (e.g. frozen dataclasses)
            object.__setattr__(result, k, v)
        return result

    return wrapper
@@ -1,129 +0,0 @@
1
- from __future__ import annotations
2
-
3
- __all__ = []
4
-
5
- import numbers
6
- import warnings
7
- from typing import Any, Mapping, NamedTuple
8
-
9
- import numpy as np
10
- from numpy.typing import NDArray
11
- from scipy.stats import iqr, ks_2samp
12
- from scipy.stats import wasserstein_distance as emd
13
-
14
- from dataeval._output import MappingOutput, set_metadata
15
-
16
-
17
class MetadataKSResult(NamedTuple):
    """Comparison result for a single metadata feature."""

    # Kolmogorov-Smirnov two-sample statistic
    statistic: float
    # Normalized position at which the KS statistic occurs
    statistic_location: float
    # Normalized Earth Mover's Distance between the two samples
    shift_magnitude: float
    # p-value of the KS two-sample test
    pvalue: float
22
-
23
-
24
class KSOutput(MappingOutput[str, MetadataKSResult]):
    """
    Mapping of metadata feature name to the result of comparing new metadata
    against reference metadata for that feature.

    Attributes
    ----------
    key: str
        Metadata feature names
    value: NamedTuple[float, float, float, float]
        A :class:`MetadataKSResult` holding four floats:
            - statistic: the KS two-sample statistic
            - statistic_location: where the KS statistic occurs, as a normalized position
            - shift_magnitude: the shift of new metadata relative to reference
            - pvalue: the p-value from the KS two-sample test
    """
39
-
40
-
41
@set_metadata
def meta_distribution_compare(
    md0: Mapping[str, list[Any] | NDArray[Any]], md1: Mapping[str, list[Any] | NDArray[Any]]
) -> KSOutput:
    """
    Measures the featurewise distance between two metadata distributions, and computes a p-value to evaluate its
    significance.

    Uses the Earth Mover's Distance and the Kolmogorov-Smirnov two-sample test, featurewise.

    Parameters
    ----------
    md0 : Mapping[str, list[Any] | NDArray[Any]]
        A set of arrays of values, indexed by metadata feature names, with one value per data example per feature.
        Treated as the reference when scaling ``shift_magnitude``.
    md1 : Mapping[str, list[Any] | NDArray[Any]]
        Another set of arrays of values, indexed by metadata feature names, with one value per data example per
        feature.

    Returns
    -------
    KSOutput
        A mapping with keys corresponding to metadata feature names. For numeric features the value is a
        MetadataKSResult with fields ``statistic`` and ``pvalue`` (from scipy.stats.ks_2samp),
        ``statistic_location`` (the value at which the KS statistic has its maximum, rescaled to the combined
        min-max range of both samples) and ``shift_magnitude`` (the Earth Mover's Distance divided by the
        interquartile range of ``md0``, falling back to half its range, then to 1.0, when that scale is zero).
        Features containing any non-numeric value map to an empty dict.

    Raises
    ------
    ValueError
        If ``md0`` and ``md1`` do not have identical keys.

    Warns
    -----
    UserWarning
        If the sample sizes for a feature are too small for the KS test to yield reliable p-values.

    Examples
    --------
    Imagine we have 3 data examples, and that the corresponding metadata contains 2 features called time and
    altitude.

    >>> md0 = {"time": [1.2, 3.4, 5.6], "altitude": [235, 6789, 101112]}
    >>> md1 = {"time": [7.8, 9.10, 11.12], "altitude": [532, 9876, 211101]}
    >>> md_out = meta_distribution_compare(md0, md1)
    >>> for k, v in md_out.items():
    ...     print(f"{k}: { {kv: round(vv, 3) for kv, vv in v._asdict().items()} }")
    time: {'statistic': 1.0, 'statistic_location': 0.444, 'shift_magnitude': 2.7, 'pvalue': 0.0}
    altitude: {'statistic': 0.333, 'statistic_location': 0.478, 'shift_magnitude': 0.749, 'pvalue': 0.944}
    """

    if (metadata_keys := md0.keys()) != md1.keys():
        raise ValueError(f"Both sets of metadata keys must be identical: {list(md0)}, {list(md1)}")

    mdc = {}  # output dict
    for k in metadata_keys:
        # Placeholder; stays in the output unchanged for non-numeric features.
        mdc[k] = {}

        x0, x1 = list(md0[k]), list(md1[k])

        allx = x0 + x1  # "+" sign concatenates lists.

        if not all(isinstance(allxi, numbers.Number) for allxi in allx):  # NB: np.nan *is* a number in this context.
            continue  # non-numeric features will return an empty dict for feature k

        # from Numerical Recipes in C, 3rd ed. p. 737. If too few points, warn and keep going.
        N, M = len(x0), len(x1)
        if np.sqrt((N * M) / (N + M)) < 4:
            warnings.warn(
                f"Sample sizes of {N}, {M} for feature {k} will yield unreliable p-values from the KS test.",
                UserWarning,
            )

        xmin, xmax = min(allx), max(allx)
        if xmin == xmax:  # only one value in this feature, so fill in the obvious results for feature k
            mdc[k] = MetadataKSResult(statistic=0.0, statistic_location=0.0, shift_magnitude=0.0, pvalue=1.0)
            continue

        ks_result = ks_2samp(x0, x1, method="asymp")
        # xmin != xmax is guaranteed by the early exit above, so the range is a valid divisor.
        loc = (ks_result.statistic_location - xmin) / (xmax - xmin)  # pyright: ignore (KSresult type)

        dX = iqr(x0)  # preferred value of dX, which is the scale of the md0 values for feature k
        dX = (max(x0) - min(x0)) / 2.0 if dX == 0 else dX  # reasonable alternative value of dX, when iqr is zero.
        dX = 1.0 if dX == 0 else dX  # if dX is *still* zero, just avoid division by zero this way

        drift = emd(x0, x1) / dX

        mdc[k] = MetadataKSResult(
            statistic=ks_result.statistic,  # pyright: ignore
            statistic_location=loc,
            shift_magnitude=drift,
            pvalue=ks_result.pvalue,  # pyright: ignore
        )

    return KSOutput(mdc)
@@ -1,198 +0,0 @@
1
- from __future__ import annotations
2
-
3
- __all__ = []
4
-
5
- from dataclasses import dataclass
6
- from typing import Any, Iterable
7
-
8
- from dataeval._output import Output, set_metadata
9
- from dataeval.metrics.stats._base import BaseStatsOutput, HistogramPlotMixin, _is_plottable, run_stats
10
- from dataeval.metrics.stats._dimensionstats import DimensionStatsOutput, DimensionStatsProcessor
11
- from dataeval.metrics.stats._labelstats import LabelStatsOutput, labelstats
12
- from dataeval.metrics.stats._pixelstats import PixelStatsOutput, PixelStatsProcessor
13
- from dataeval.metrics.stats._visualstats import VisualStatsOutput, VisualStatsProcessor
14
- from dataeval.typing import ArrayLike
15
- from dataeval.utils._plot import channel_histogram_plot
16
-
17
-
18
@dataclass(frozen=True)
class DatasetStatsOutput(Output, HistogramPlotMixin):
    """
    Output class for :func:`.datasetstats` stats metric.

    Bundles the results of several stats functions computed over one dataset,
    so that a given index refers to the same source image in every contained
    output. Modifying or mixing outputs will result in inaccurate outlier
    calculations if not created correctly.

    Attributes
    ----------
    dimensionstats : DimensionStatsOutput
    pixelstats: PixelStatsOutput
    visualstats: VisualStatsOutput
    labelstats: LabelStatsOutput or None
    """

    dimensionstats: DimensionStatsOutput
    pixelstats: PixelStatsOutput
    visualstats: VisualStatsOutput
    labelstats: LabelStatsOutput | None = None

    # Unannotated, so this is a plain class attribute rather than a dataclass field.
    # NOTE(review): presumably read by HistogramPlotMixin - confirm.
    _excluded_keys = ["histogram", "percentiles"]

    def _outputs(self) -> list[Output]:
        """Return the contained outputs that are present, in declaration order."""
        candidates = (self.dimensionstats, self.pixelstats, self.visualstats, self.labelstats)
        return [stat for stat in candidates if stat is not None]

    def dict(self) -> dict[str, Any]:
        """Merge the dictionaries of all contained outputs; later outputs win on duplicate keys."""
        merged = {}
        for output in self._outputs():
            merged.update(output.dict())
        return merged

    def __post_init__(self) -> None:
        # Only BaseStatsOutput instances take part in the length consistency check.
        lengths = [len(stat) for stat in self._outputs() if isinstance(stat, BaseStatsOutput)]
        if len(set(lengths)) > 1:
            raise ValueError("All StatsOutput classes must contain the same number of image sources.")
53
-
54
-
55
def _get_channels(cls, channel_limit: int | None = None, channel_index: int | Iterable[int] | None = None):
    """
    Work out how many channels to show and which entries belong to them.

    Parameters
    ----------
    cls : object
        Stats output exposing ``dict()["source_index"]`` (entries with a ``channel``
        attribute) and ``pixelstats.get_channel_mask``.
    channel_limit : int or None
        Maximum number of channels; only used when ``channel_index`` does not select channels.
    channel_index : int, Iterable[int] or None
        Specific channel or channels to select. Takes precedence over ``channel_limit``.

    Returns
    -------
    tuple
        ``(max_channels, ch_mask)`` where ``max_channels`` is capped at the number of
        channels found in the data and ``ch_mask`` is the mask returned by
        ``get_channel_mask``, or None when no selection applies or the mask selects nothing.
    """
    raw_channels = max(si.channel for si in cls.dict()["source_index"]) + 1

    # Materialize an iterable selection exactly once. Re-iterating the argument would
    # exhaust one-shot iterators (generators, iter(...)) before they reach
    # get_channel_mask and would report a channel count of zero.
    index_list = None
    if not isinstance(channel_index, int) and isinstance(channel_index, Iterable):
        index_list = list(channel_index)

    if isinstance(channel_index, int):
        max_channels = 1 if channel_index < raw_channels else raw_channels
        ch_mask = cls.pixelstats.get_channel_mask(channel_index)
    elif index_list is not None and all(isinstance(val, int) for val in index_list):
        max_channels = len(index_list)
        ch_mask = cls.pixelstats.get_channel_mask(index_list)
    elif isinstance(channel_limit, int):
        max_channels = channel_limit
        ch_mask = cls.pixelstats.get_channel_mask(None, channel_limit)
    else:
        max_channels = raw_channels
        ch_mask = None

    # Never report more channels than the data contains.
    max_channels = min(max_channels, raw_channels)
    # A mask that selects nothing is treated the same as having no mask.
    if ch_mask is not None and not any(ch_mask):
        ch_mask = None

    return max_channels, ch_mask
76
-
77
-
78
@dataclass(frozen=True)
class ChannelStatsOutput(Output):
    """
    Output class for :func:`.channelstats` stats metric.

    Bundles the per-channel pixel and visual stats computed over one dataset,
    so that a given index refers to the same source image in both outputs.
    Modifying or mixing outputs will result in inaccurate outlier calculations
    if not created correctly.

    Attributes
    ----------
    pixelstats: PixelStatsOutput
    visualstats: VisualStatsOutput
    """

    pixelstats: PixelStatsOutput
    visualstats: VisualStatsOutput

    def _outputs(self) -> tuple[PixelStatsOutput, VisualStatsOutput]:
        """Return both contained outputs as a pair."""
        return self.pixelstats, self.visualstats

    def dict(self) -> dict[str, Any]:
        """Merge both outputs' dictionaries; visual stats win on duplicate keys."""
        merged = {}
        merged.update(self.pixelstats.dict())
        merged.update(self.visualstats.dict())
        return merged

    def __post_init__(self) -> None:
        pixel_count, visual_count = (len(stat) for stat in self._outputs())
        if pixel_count != visual_count:
            raise ValueError("All StatsOutput classes must contain the same number of image sources.")

    def plot(
        self, log: bool, channel_limit: int | None = None, channel_index: int | Iterable[int] | None = None
    ) -> None:
        """
        Plot per-channel histograms of the plottable stats.

        Parameters
        ----------
        log : bool
            Passed through to ``channel_histogram_plot``.
        channel_limit : int or None
            Maximum number of channels; passed to ``_get_channels``.
        channel_index : int, Iterable[int] or None
            Channel or channels to select; passed to ``_get_channels``.
        """
        max_channels, ch_mask = _get_channels(self, channel_limit, channel_index)
        excluded = ("histogram", "percentiles")
        plottable = {}
        for key, value in self.dict().items():
            if _is_plottable(key, value, excluded):
                plottable[key] = value
        channel_histogram_plot(plottable, log, max_channels, ch_mask)
114
-
115
-
116
@set_metadata
def datasetstats(
    images: Iterable[ArrayLike],
    bboxes: Iterable[ArrayLike] | None = None,
    labels: Iterable[ArrayLike] | None = None,
) -> DatasetStatsOutput:
    """
    Calculates various :term:`statistics<Statistics>` for each image.

    This function computes dimension, pixel and visual metrics
    on the images or individual bounding boxes for each image as
    well as label statistics if provided.

    Parameters
    ----------
    images : Iterable[ArrayLike]
        Images to perform calculations on
    bboxes : Iterable[ArrayLike] or None
        Bounding boxes in `xyxy` format for each image to perform calculations on
    labels : Iterable[ArrayLike] or None
        Labels of images or boxes to perform calculations on. Label statistics
        are skipped when this is None or a sized container of length zero.

    Returns
    -------
    DatasetStatsOutput
        Output class containing the outputs of various stats functions

    See Also
    --------
    dimensionstats, labelstats, pixelstats, visualstats, Outliers

    Examples
    --------
    Calculating the dimension, pixel and visual stats for a dataset with bounding boxes

    >>> stats = datasetstats(stats_images, bboxes)
    >>> print(stats.dimensionstats.aspect_ratio)
    [ 0.864 0.5884 16. 1.143 1.692 0.5835 0.6665 2.555 1.3 ]
    >>> print(stats.visualstats.sharpness)
    [4.04 4.434 0.2778 4.957 5.145 5.22 4.957 3.076 2.855 ]
    """
    outputs = run_stats(images, bboxes, False, [DimensionStatsProcessor, PixelStatsProcessor, VisualStatsProcessor])

    # Decide whether labels were supplied without relying on truthiness: bool() of a
    # multi-element NumPy array raises ValueError. Sized containers count as supplied
    # when non-empty; unsized iterables (e.g. generators) always count as supplied.
    has_labels = labels is not None
    if has_labels:
        try:
            has_labels = len(labels) > 0  # type: ignore
        except TypeError:
            has_labels = True

    return DatasetStatsOutput(*outputs, labelstats=labelstats(labels) if has_labels else None)  # type: ignore
159
-
160
-
161
@set_metadata
def channelstats(
    images: Iterable[ArrayLike],
    bboxes: Iterable[ArrayLike] | None = None,
) -> ChannelStatsOutput:
    """
    Calculates various per-channel :term:`statistics` for each image.

    Pixel and visual metrics are computed on the images, or on the
    individual bounding boxes of each image when boxes are given.

    Parameters
    ----------
    images : Iterable[ArrayLike]
        Images to perform calculations on
    bboxes : Iterable[ArrayLike] or None
        Bounding boxes in `xyxy` format for each image to perform calculations on

    Returns
    -------
    ChannelStatsOutput
        Output class containing the per-channel outputs of various stats functions

    See Also
    --------
    pixelstats, visualstats

    Examples
    --------
    Calculating the per-channel pixel and visual stats for a dataset

    >>> stats = channelstats(stats_images)
    >>> print(stats.visualstats.darkness)
    [0.1499 0.3499 0.55 0.2094 0.2219 0.2344 0.4194 0.6094 0.622 0.6343
    0.8154]
    """
    per_channel = True
    processors = [PixelStatsProcessor, VisualStatsProcessor]
    # run_stats yields one output per processor, in the order the processors are listed.
    pixel_and_visual = run_stats(images, bboxes, per_channel, processors)
    return ChannelStatsOutput(*pixel_and_visual)  # type: ignore