dslighting 1.7.1-py3-none-any.whl → 1.7.8-py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (352)
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/METADATA +3 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,194 @@
+ import shutil
+ from pathlib import Path
+
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+ from tqdm import tqdm
+
+ from mlebench.utils import extract, get_logger, read_csv
+
+ logger = get_logger(__name__)
+
+
+ def _create_split(
+     input_df: pd.DataFrame,
+     test_size: float,
+     random_state: int,
+     raw_images_path: Path,
+     public_path: Path,
+     private_path: Path,
+     dev_mode: bool = False,
+ ) -> pd.DataFrame:
+     """
+     Helper function to perform a data split, create necessary files, and organize directories.
+
+     Args:
+         input_df: The dataframe to be split.
+         test_size: The proportion of the dataset to allocate to the test split.
+         random_state: The seed used by the random number generator.
+         raw_images_path: Path to the directory containing all source images.
+         public_path: The target public directory.
+         private_path: The target private directory.
+         dev_mode: If True, uses a small sample for faster processing.
+
+     Returns:
+         The training portion of the split dataframe.
+     """
+     # Create train, test from the input dataframe
+     locations = input_df["location"].unique()
+     train_locations, test_locations = train_test_split(
+         locations, test_size=test_size, random_state=random_state
+     )
+
+     input_df["split"] = input_df["location"].apply(
+         lambda loc: "test" if loc in test_locations else "train"
+     )
+
+     train_df = input_df[input_df["split"] == "train"].drop(columns=["split"])
+     answers_df = input_df[input_df["split"] == "test"].drop(columns=["split"])
+
+     logger.debug("Train locations: %s", train_locations)
+     logger.debug("Test locations: %s", test_locations)
+     logger.debug(
+         "Test size for this split: %s",
+         len(answers_df) / (len(train_df) + len(answers_df)),
+     )
+
+     input_df.drop(columns=["split"], inplace=True)  # Drop helper column
+
+     test_df = answers_df.copy().drop(columns=["category_id"])
+     gold_submission_df = answers_df.copy()[["id", "category_id"]]
+     gold_submission_df.rename(
+         columns={"id": "Id", "category_id": "Category"}, inplace=True
+     )
+
+     # Make sample submission
+     submission_df = test_df.copy()[["id"]]
+     submission_df["category_id"] = 0
+     submission_df.rename(columns={"id": "Id", "category_id": "Category"}, inplace=True)
+
+     # Checks
+     assert set(train_df["id"]).isdisjoint(
+         set(test_df["id"])
+     ), "train_df and test_df are not disjoint"
+     assert len(train_df) + len(test_df) == len(
+         input_df
+     ), "Length of train_df and test_df should be equal to the length of the input dataframe"
+     assert len(answers_df) == len(
+         test_df
+     ), "Length of answers_df should be equal to the length of test_df"
+     assert len(submission_df) == len(
+         answers_df
+     ), "Length of answers_df should be equal to the length of the sample submission"
+     assert (
+         input_df.columns.tolist() == train_df.columns.tolist()
+     ), f"train_df should have the same columns as the input dataframe: input_df: {input_df.columns.tolist()} != train_df: {train_df.columns.tolist()}"
+     assert set(train_df["location"]).isdisjoint(
+         set(test_df["location"])
+     ), "train_df and test_df should not share any locations"
+
+     # Create directories
+     public_path.mkdir(exist_ok=True, parents=True)
+     private_path.mkdir(exist_ok=True, parents=True)
+
+     # Write CSVs
+     answers_df.to_csv(private_path / "test.csv", index=False)
+     gold_submission_df.to_csv(private_path / "answers.csv", index=False)
+     train_df.to_csv(public_path / "train.csv", index=False)
+     test_df.to_csv(public_path / "test.csv", index=False)
+     submission_df.to_csv(public_path / "sample_submission.csv", index=True)
+
+     # Prepare for file copy
+     public_train_images = public_path / "train_images"
+     public_test_images = public_path / "test_images"
+     public_train_images.mkdir(exist_ok=True)
+     public_test_images.mkdir(exist_ok=True)
+
+     loop_train_df = train_df.sample(n=100) if dev_mode else train_df
+     loop_test_df = test_df.sample(n=100) if dev_mode else test_df
+
+     for file_id in tqdm(loop_train_df["id"], desc=f"Copying train images to {public_path.name}"):
+         shutil.copyfile(
+             src=raw_images_path / f"{file_id}.jpg",
+             dst=public_train_images / f"{file_id}.jpg",
+         )
+
+     for file_id in tqdm(loop_test_df["id"], desc=f"Copying test images to {public_path.name}"):
+         shutil.copyfile(
+             src=raw_images_path / f"{file_id}.jpg",
+             dst=public_test_images / f"{file_id}.jpg",
+         )
+
+     # Check integrity of the files copied
+     assert len(list(public_test_images.glob("*.jpg"))) == len(
+         loop_test_df["id"].unique()
+     ), f"Public test images in {public_path.name} should have the same number of images as the unique ids in the test set"
+     assert len(list(public_train_images.glob("*.jpg"))) == len(
+         loop_train_df["id"].unique()
+     ), f"Public train images in {public_path.name} should have the same number of images as the unique ids in the train set"
+
+     # Zip up image directories and delete non-zipped files
+     shutil.make_archive(
+         public_path / "train_images", "zip", public_train_images
+     )
+     shutil.make_archive(public_path / "test_images", "zip", public_test_images)
+     shutil.rmtree(public_train_images)
+     shutil.rmtree(public_test_images)
+
+     return train_df
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     """
+     Splits the data in raw into public and private datasets with appropriate test/train splits.
+     Also creates a secondary validation split in public_val/private_val directories.
+     """
+     dev_mode = False
+     test_size = 0.1
+     random_state = 8  # We target a 44% test set size, we have empirically trialed seeds and landed on 8 to achieve this
+
+     # --- Setup and Initial Data Extraction ---
+     old_train = read_csv(raw / "train.csv")
+     raw_images_path = raw / "train_images"
+     raw_images_path.mkdir(exist_ok=True)
+     logger.info("Extracting raw images...")
+     extract(raw / "train_images.zip", raw_images_path)
+     assert len(list(raw_images_path.glob("*.jpg"))) == len(
+         old_train["id"].unique()
+     ), f"Raw train images should have the same number of images as the unique ids in the old train set, but got {len(list(raw_images_path.glob('*.jpg')))} files and {len(old_train['id'].unique())} ids"
+
+     # --- First Split (Original Public/Private) ---
+     # This creates the main competition data. The outputs in `public` and `private`
+     # will be identical to the original script's output.
+     logger.info("Creating original train/test split for competition...")
+     train_from_first_split = _create_split(
+         input_df=old_train,
+         test_size=test_size,
+         random_state=random_state,
+         raw_images_path=raw_images_path,
+         public_path=public,
+         private_path=private,
+         dev_mode=dev_mode,
+     )
+
+     # --- Second Split (New Validation Set) ---
+     # This takes the training data from the first split and splits it again
+     # to create a new, smaller training set and a validation set.
+     # The output structure in `public_val` and `private_val` mirrors the original.
+     logger.info("Creating validation train/test split for local development...")
+     public_val = public.parent / "public_val"
+     private_val = private.parent / "private_val"
+     _create_split(
+         input_df=train_from_first_split,
+         test_size=test_size,
+         random_state=random_state,  # Use same params to replicate splitting logic
+         raw_images_path=raw_images_path,
+         public_path=public_val,
+         private_path=private_val,
+         dev_mode=dev_mode,
+     )
+
+     # Clean up the extracted raw images directory
+     logger.info("Cleaning up extracted raw images...")
+     shutil.rmtree(raw_images_path)
+     logger.info("Data preparation complete.")
@@ -0,0 +1,11 @@
+ import pandas as pd
+ from sklearn.metrics import accuracy_score
+
+ from mlebench.competitions.utils import prepare_for_accuracy_metric
+
+
+ def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+     accuracy_inputs = prepare_for_accuracy_metric(
+         submission=submission, answers=answers, target_column="Category", id_column="Id"
+     )
+     return accuracy_score(**accuracy_inputs)
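
This grader delegates its validation and alignment work to prepare_for_accuracy_metric from mlebench/competitions/utils.py (entry 325 above). As a rough, hypothetical illustration of the contract it is assumed to implement, aligning the two frames on Id and handing accuracy_score its arguments, the equivalent by-hand computation on toy data is:

import pandas as pd
from sklearn.metrics import accuracy_score

# Toy submission and answer key, both keyed by "Id" (hypothetical data).
submission = pd.DataFrame({"Id": ["a", "b", "c"], "Category": [3, 1, 2]})
answers = pd.DataFrame({"Id": ["c", "a", "b"], "Category": [2, 3, 0]})

# Assumed behaviour of prepare_for_accuracy_metric: align rows on the id column,
# then expose y_true/y_pred for sklearn's accuracy_score.
merged = answers.merge(submission, on="Id", suffixes=("_true", "_pred"))
print(accuracy_score(merged["Category_true"], merged["Category_pred"]))  # 2 of 3 correct
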
@@ -0,0 +1,164 @@
+ import json
+ import shutil
+ from pathlib import Path
+
+ import numpy as np
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+ from tqdm import tqdm
+
+ from mlebench.competitions.utils import df_to_one_hot
+ from mlebench.utils import read_csv
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     """
+     Splits the data in raw into public and private datasets with appropriate test/train splits.
+     """
+
+     dev_mode = False
+
+     # Create train, test from train split
+
+     # Load old train
+     with open(raw / "iwildcam2020_train_annotations.json", "r") as file:
+         old_train_json = json.load(file)
+     old_train_annotations = pd.DataFrame(old_train_json["annotations"])
+     old_train_images = pd.DataFrame(old_train_json["images"])
+     old_train_categories = pd.DataFrame(old_train_json["categories"])
+     # old_train_info = pd.DataFrame(old_train_json["info"])
+
+     # Load old test
+     with open(raw / "iwildcam2020_test_information.json", "r") as file:
+         old_test_json = json.load(file)
+     old_test_categories = pd.DataFrame(old_test_json["categories"])
+
+     # Create splits based on train's images' on 'location'
+     test_size = 0.22  # 62894/(217959+62894) = 0.22
+     train_image_locations = old_train_images["location"].unique()
+     locations_new_train, locations_new_test = train_test_split(
+         train_image_locations, test_size=test_size, random_state=0
+     )
+
+     # Filter old train to new train and new test based on location
+     new_train_images = old_train_images[old_train_images["location"].isin(locations_new_train)]
+     new_test_images = old_train_images[old_train_images["location"].isin(locations_new_test)]
+
+     # Adjust the split to ensure around test_size of total samples are in the new test set
+     while len(new_test_images) / (len(old_train_images) + len(new_test_images)) < test_size:
+         # Move some locations from train to test
+         location_to_move = locations_new_train[-1]
+         locations_new_train = locations_new_train[:-1]
+         locations_new_test = np.append(locations_new_test, location_to_move)
+         new_train_images = old_train_images[old_train_images["location"].isin(locations_new_train)]
+         new_test_images = old_train_images[old_train_images["location"].isin(locations_new_test)]
+
+     while len(new_test_images) / (len(old_train_images) + len(new_test_images)) > test_size:
+         # Move some locations from test to train
+         location_to_move = locations_new_test[-1]
+         locations_new_test = locations_new_test[:-1]
+         locations_new_train = np.append(locations_new_train, location_to_move)
+         new_train_images = old_train_images[old_train_images["location"].isin(locations_new_train)]
+         new_test_images = old_train_images[old_train_images["location"].isin(locations_new_test)]
+
+     # Get the image ids for new train and new test
+     new_train_ids = new_train_images["id"].unique()
+     new_test_ids = new_test_images["id"].unique()
+
+     # Filter annotations based on new train and new test image ids
+     new_train_annotations = old_train_annotations[
+         old_train_annotations["image_id"].isin(new_train_ids)
+     ]
+     new_test_annotations = old_train_annotations[
+         old_train_annotations["image_id"].isin(new_test_ids)
+     ]
+     new_train_categories = old_train_categories.copy()
+     new_test_categories = old_test_categories.copy()
+
+     # Answers
+     answer_annotations = new_test_annotations[["image_id", "category_id"]].copy()
+     answer_annotations.rename(columns={"image_id": "Id", "category_id": "Category"}, inplace=True)
+
+     # Create a sample submission file
+     sample_submission = answer_annotations.copy()
+     np.random.seed(0)
+     sample_submission["Category"] = np.random.randint(
+         0, 676, size=len(sample_submission)
+     )  # Uniform between 0 and 675
+
+     # Checks
+     assert set(new_train_annotations["image_id"]).isdisjoint(
+         set(new_test_images["id"])
+     ), "Train should not contain annotations of test images"
+     assert len(new_train_ids) + len(new_test_ids) == len(
+         old_train_images["id"]
+     ), "The combined length of new_train_ids and new_test_ids should equal the length of old_train_images"
+     # Assert that new_train_images and new_test_images have disjoint locations
+     assert set(new_train_images["location"]).isdisjoint(
+         set(new_test_images["location"])
+     ), "Train and test images should not share locations"
+
+     # Reform JSON files
+     new_train_json = {
+         "annotations": new_train_annotations.to_dict(orient="records"),
+         "images": new_train_images.to_dict(orient="records"),
+         "categories": new_train_categories.to_dict(orient="records"),
+         "info": old_train_json["info"],
+     }
+
+     new_test_json = {
+         "images": new_test_images.to_dict(orient="records"),
+         "categories": new_test_categories.to_dict(orient="records"),
+         "info": old_test_json["info"],
+     }
+
+     # Checks on JSONs
+     assert (
+         new_train_json.keys() == old_train_json.keys()
+     ), "new_train_json and old_train_json should have the same keys"
+     assert (
+         new_test_json.keys() == old_test_json.keys()
+     ), "new_test_json and old_test_json should have the same keys"
+
+     # Write files
+     answer_annotations.to_csv(private / "answers.csv", index=False)
+     sample_submission.to_csv(public / "sample_submission.csv", index=False)
+     with open(public / "iwildcam2020_train_annotations.json", "w") as f:
+         json.dump(new_train_json, f)
+     with open(public / "iwildcam2020_test_information.json", "w") as f:
+         json.dump(new_test_json, f)
+
+     # Copy over megadetector results
+     shutil.copyfile(
+         raw / "iwildcam2020_megadetector_results.json",
+         public / "iwildcam2020_megadetector_results.json",
+     )
+
+     # Reduce the number of images copied over to 100 for dev mode
+     if dev_mode:
+         new_train_ids = new_train_ids[:100]
+         new_test_ids = new_test_ids[:100]
+
+     # Copy over image files
+     (public / "train").mkdir(exist_ok=True)
+     (public / "test").mkdir(exist_ok=True)
+
+     for file_id in tqdm(new_train_ids, desc="Copying train images", unit="file"):
+         shutil.copyfile(
+             src=raw / "train" / f"{file_id}.jpg",
+             dst=public / "train" / f"{file_id}.jpg",
+         )
+
+     for file_id in tqdm(new_test_ids, desc="Copying test images", unit="file"):
+         shutil.copyfile(
+             src=raw / "train" / f"{file_id}.jpg",
+             dst=public / "test" / f"{file_id}.jpg",
+         )
+
+     # Check integrity of the files copied
+     assert len(list(public.glob("train/*.jpg"))) == len(
+         new_train_ids
+     ), "Number of train images should be equal to the number of unique image_id in the train set"
+     assert len(list(public.glob("test/*.jpg"))) == len(
+         new_test_ids
+     ), "Number of test images should be equal to the number of unique image_id in the test set"
@@ -0,0 +1,245 @@
+ import json
+ import shutil
+ from pathlib import Path
+ from typing import Dict, Tuple
+
+ import numpy as np
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+ from tqdm import tqdm
+
+ from mlebench.competitions.utils import df_to_one_hot
+ from mlebench.utils import read_csv
+
+
+ def _perform_split(
+     images_to_split: pd.DataFrame, test_size: float, random_state: int
+ ) -> Tuple[np.ndarray, np.ndarray]:
+     """Performs a location-based split on the given images DataFrame."""
+     image_locations = images_to_split["location"].unique()
+     train_locations, test_locations = train_test_split(
+         image_locations, test_size=test_size, random_state=random_state
+     )
+
+     # The original script's logic for fine-tuning the split size
+     temp_train_images = images_to_split[images_to_split["location"].isin(train_locations)]
+     temp_test_images = images_to_split[images_to_split["location"].isin(test_locations)]
+
+     while len(temp_test_images) / len(images_to_split) < test_size and len(train_locations) > 1:
+         location_to_move = train_locations[-1]
+         train_locations = train_locations[:-1]
+         test_locations = np.append(test_locations, location_to_move)
+         temp_train_images = images_to_split[images_to_split["location"].isin(train_locations)]
+         temp_test_images = images_to_split[images_to_split["location"].isin(test_locations)]
+
+     while len(temp_test_images) / len(images_to_split) > test_size and len(test_locations) > 1:
+         location_to_move = test_locations[-1]
+         test_locations = test_locations[:-1]
+         train_locations = np.append(train_locations, location_to_move)
+         temp_train_images = images_to_split[images_to_split["location"].isin(train_locations)]
+         temp_test_images = images_to_split[images_to_split["location"].isin(test_locations)]
+
+     return train_locations, test_locations
+
+
+ def _write_dataset_files(
+     public_path: Path,
+     private_path: Path,
+     train_images: pd.DataFrame,
+     test_images: pd.DataFrame,
+     train_annotations: pd.DataFrame,
+     test_annotations: pd.DataFrame,
+     categories_df: pd.DataFrame,
+     info_json: Dict,
+     test_info_json: Dict,
+     raw_path: Path,
+     dev_mode: bool,
+ ):
+     """Writes all the necessary files for a given train/test split to the specified paths."""
+     # Create output directories
+     public_path.mkdir(exist_ok=True)
+     private_path.mkdir(exist_ok=True)
+
+     # Answers
+     answer_annotations = test_annotations[["image_id", "category_id"]].copy()
+     answer_annotations.rename(columns={"image_id": "Id", "category_id": "Category"}, inplace=True)
+
+     # Create a sample submission file
+     sample_submission = answer_annotations.copy()
+     np.random.seed(0)
+     sample_submission["Category"] = np.random.randint(
+         0, 676, size=len(sample_submission)
+     )  # Uniform between 0 and 675
+
+     # Reform JSON files
+     new_train_json = {
+         "annotations": train_annotations.to_dict(orient="records"),
+         "images": train_images.to_dict(orient="records"),
+         "categories": categories_df.to_dict(orient="records"),
+         "info": info_json,
+     }
+
+     new_test_json = {
+         "images": test_images.to_dict(orient="records"),
+         "categories": pd.DataFrame(test_info_json["categories"]).to_dict(orient="records"),
+         "info": test_info_json["info"],
+     }
+
+     # Write files
+     answer_annotations.to_csv(private_path / "answers.csv", index=False)
+     sample_submission.to_csv(public_path / "sample_submission.csv", index=False)
+     with open(public_path / "iwildcam2020_train_annotations.json", "w") as f:
+         json.dump(new_train_json, f)
+     with open(public_path / "iwildcam2020_test_information.json", "w") as f:
+         json.dump(new_test_json, f)
+
+     # Copy over megadetector results
+     shutil.copyfile(
+         raw_path / "iwildcam2020_megadetector_results.json",
+         public_path / "iwildcam2020_megadetector_results.json",
+     )
+
+     train_ids_to_copy = train_images["id"].unique()
+     test_ids_to_copy = test_images["id"].unique()
+
+     # Reduce the number of images copied over to 100 for dev mode
+     if dev_mode:
+         train_ids_to_copy = train_ids_to_copy[:100]
+         test_ids_to_copy = test_ids_to_copy[:100]
+
+     # Copy over image files
+     (public_path / "train").mkdir(exist_ok=True)
+     (public_path / "test").mkdir(exist_ok=True)
+
+     print(f"Copying images to {public_path}...")
+     for file_id in tqdm(train_ids_to_copy, desc="Copying train images", unit="file"):
+         shutil.copyfile(
+             src=raw_path / "train" / f"{file_id}.jpg",
+             dst=public_path / "train" / f"{file_id}.jpg",
+         )
+
+     for file_id in tqdm(test_ids_to_copy, desc="Copying test images", unit="file"):
+         shutil.copyfile(
+             src=raw_path / "train" / f"{file_id}.jpg",
+             dst=public_path / "test" / f"{file_id}.jpg",
+         )
+
+     # Check integrity of the files copied
+     assert len(list(public_path.glob("train/*.jpg"))) == len(
+         train_ids_to_copy
+     ), "Number of train images should be equal to the number of unique image_id in the train set"
+     assert len(list(public_path.glob("test/*.jpg"))) == len(
+         test_ids_to_copy
+     ), "Number of test images should be equal to the number of unique image_id in the test set"
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     """
+     Splits the data in raw into public and private datasets with appropriate test/train splits.
+     Also creates a second, parallel validation split (public_val, private_val).
+     """
+
+     dev_mode = False
+
+     # Define paths for the new validation split
+     public_val = public.parent / "public_val"
+     private_val = private.parent / "private_val"
+
+     # Load raw data once
+     with open(raw / "iwildcam2020_train_annotations.json", "r") as file:
+         old_train_json = json.load(file)
+     old_train_annotations = pd.DataFrame(old_train_json["annotations"])
+     old_train_images = pd.DataFrame(old_train_json["images"])
+     old_train_categories = pd.DataFrame(old_train_json["categories"])
+
+     with open(raw / "iwildcam2020_test_information.json", "r") as file:
+         old_test_json = json.load(file)
+
+     # ==================================================================
+     # 1. Create the original Train / Test split
+     # ==================================================================
+     print("--- Creating original Train/Test split ---")
+     test_size_orig = 0.22  # 62894/(217959+62894) = 0.22
+     locations_train, locations_test = _perform_split(
+         images_to_split=old_train_images, test_size=test_size_orig, random_state=0
+     )
+
+     # Filter original data to create the first train/test sets
+     train_images = old_train_images[old_train_images["location"].isin(locations_train)]
+     test_images = old_train_images[old_train_images["location"].isin(locations_test)]
+     train_ids = train_images["id"].unique()
+     test_ids = test_images["id"].unique()
+     train_annotations = old_train_annotations[old_train_annotations["image_id"].isin(train_ids)]
+     test_annotations = old_train_annotations[old_train_annotations["image_id"].isin(test_ids)]
+
+     # Checks
+     assert set(train_annotations["image_id"]).isdisjoint(
+         set(test_images["id"])
+     ), "Train should not contain annotations of test images"
+     assert len(train_ids) + len(test_ids) == len(
+         old_train_images["id"]
+     ), "The combined length of new_train_ids and new_test_ids should equal the length of old_train_images"
+     assert set(train_images["location"]).isdisjoint(
+         set(test_images["location"])
+     ), "Train and test images should not share locations"
+
+     # Write files for the original public/private split
+     _write_dataset_files(
+         public_path=public,
+         private_path=private,
+         train_images=train_images,
+         test_images=test_images,
+         train_annotations=train_annotations,
+         test_annotations=test_annotations,
+         categories_df=old_train_categories,
+         info_json=old_train_json["info"],
+         test_info_json=old_test_json,
+         raw_path=raw,
+         dev_mode=dev_mode,
+     )
+
+     # ==================================================================
+     # 2. Create the new Train / Validation split from the first training set
+     # ==================================================================
+     print("\n--- Creating new Train/Validation split ---")
+     # The new split is performed on the `train_images` from the *first* split.
+     # We calculate the test_size to make the new validation set have the same
+     # number of images as the original test set.
+     test_size_val = len(test_images) / len(train_images)
+
+     locations_train_val, locations_test_val = _perform_split(
+         images_to_split=train_images, test_size=test_size_val, random_state=0
+     )
+
+     # Filter the first training set to create the second (train_val/test_val) sets
+     train_val_images = train_images[train_images["location"].isin(locations_train_val)]
+     test_val_images = train_images[train_images["location"].isin(locations_test_val)]
+     train_val_ids = train_val_images["id"].unique()
+     test_val_ids = test_val_images["id"].unique()
+     train_val_annotations = train_annotations[train_annotations["image_id"].isin(train_val_ids)]
+     test_val_annotations = train_annotations[train_annotations["image_id"].isin(test_val_ids)]
+
+     # Checks
+     assert set(train_val_images["location"]).isdisjoint(
+         set(test_val_images["location"])
+     ), "Validation train and test images should not share locations"
+     assert len(train_val_ids) + len(test_val_ids) == len(
+         train_ids
+     ), "The combined length of train_val and test_val should equal the length of the original train set"
+
+     # Write files for the new public_val/private_val split
+     _write_dataset_files(
+         public_path=public_val,
+         private_path=private_val,
+         train_images=train_val_images,
+         test_images=test_val_images,
+         train_annotations=train_val_annotations,
+         test_annotations=test_val_annotations,
+         categories_df=old_train_categories,
+         info_json=old_train_json["info"],
+         test_info_json=old_test_json,
+         raw_path=raw,
+         dev_mode=dev_mode,
+     )
+
+     print("\nData preparation complete.")
@@ -0,0 +1 @@
+ CLASSES = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
@@ -0,0 +1,54 @@
+ import pandas as pd
+ from sklearn.metrics import roc_auc_score
+
+ from mlebench.grade_helpers import InvalidSubmissionError
+
+ from .classes import CLASSES
+
+
+ def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame) -> dict:
+     classes_set = set(CLASSES)
+     id_col = "id"
+
+     # submission checks
+     if id_col not in submission.columns:
+         raise InvalidSubmissionError(f"Submission is missing the column: {id_col}")
+     sub_cols_set = set(submission.columns)
+     if not classes_set.issubset(sub_cols_set):
+         raise InvalidSubmissionError(
+             f"Submission is missing the following columns: {classes_set - sub_cols_set}"
+         )
+
+     # answers checks
+     assert id_col in answers.columns, f"Answers is missing `{id_col}` column"
+     ans_cols_set = set(answers.columns)
+     assert classes_set.issubset(
+         ans_cols_set
+     ), f"Answers is missing the following columns: {classes_set - ans_cols_set}"
+     assert len(submission) == len(answers), "Submission and answers have different lengths"
+
+     submission = submission.set_index("id").sort_index()
+     answers = answers.set_index("id").sort_index()
+
+     # skip rows marked with -1 in y_true
+     # when it happens entire row is marked so we can check negative sum of the row
+     keep_mask = answers[CLASSES].sum(axis=1) >= 0
+     answers = answers[keep_mask]
+     submission = submission[keep_mask]
+
+     roc_auc_inputs = {
+         "y_score": submission.to_numpy(),
+         "y_true": answers.to_numpy(),
+         # metric for each column, then average across columns
+         "average": "macro",
+     }
+
+     return roc_auc_inputs
+
+
+ def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+     """
+     Computes the column-wise mean ROC AUC score for the submission.
+     """
+     roc_auc_inputs = prepare_for_metric(submission, answers)
+     return roc_auc_score(**roc_auc_inputs)
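
The keep_mask step is the one non-obvious part of this grader: answer rows flagged with -1 across every class column are dropped from both frames before scoring, and the remaining label matrix is scored with a macro-averaged ROC AUC. A self-contained sketch on toy data (hypothetical, not from the package) showing the same masking and metric call:

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

CLASSES = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Toy answers: the last row is marked -1 in every class column (unscored).
answers = pd.DataFrame(
    [[1, 0, 1, 0, 1, 0], [0, 1, 0, 1, 0, 1], [-1, -1, -1, -1, -1, -1]],
    columns=CLASSES,
)
scores = pd.DataFrame(np.random.default_rng(0).random((3, 6)), columns=CLASSES)

keep_mask = answers[CLASSES].sum(axis=1) >= 0  # drops the all -1 row
y_true = answers[keep_mask].to_numpy()
y_score = scores[keep_mask].to_numpy()
print(roc_auc_score(y_true, y_score, average="macro"))
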