dslighting-1.7.1-py3-none-any.whl → dslighting-1.7.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352)
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/METADATA +3 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/entry_points.txt +0 -0

mlebench/competitions/petfinder-pawpularity-score/prepare.py (new file, +76)
@@ -0,0 +1,76 @@
+ import shutil
+ from pathlib import Path
+
+ import numpy as np
+ from sklearn.model_selection import train_test_split
+ from tqdm.auto import tqdm
+
+ from mlebench.utils import read_csv
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+
+     old_train = read_csv(raw / "train.csv")
+
+     np_rng = np.random.default_rng(0)
+
+     # Original ratio: 6800/(9912 + 6800) = ~ 0.41 test_size
+     # We use 0.1 ratio to avoid taking out too many samples from train
+     new_train, new_test = train_test_split(old_train, test_size=0.1, random_state=0)
+
+     new_test_without_labels = new_test.drop(columns=["Pawpularity"])
+
+     # random floats between 1 and 100 inclusive, with 2 decimal places
+     sample_submission = new_test[["Id", "Pawpularity"]].copy()
+     sample_submission["Pawpularity"] = np_rng.uniform(1, 100, len(sample_submission)).round(2)
+
+     new_train.to_csv(public / "train.csv", index=False)
+     new_test.to_csv(private / "test.csv", index=False)
+     new_test_without_labels.to_csv(public / "test.csv", index=False)
+     sample_submission.to_csv(public / "sample_submission.csv", index=False)
+
+     (public / "train").mkdir(exist_ok=True)
+     for img_id in tqdm(new_train["Id"], desc="Copying train images", total=len(new_train)):
+         shutil.copy(raw / "train" / f"{img_id}.jpg", public / "train" / f"{img_id}.jpg")
+
+     (public / "test").mkdir(exist_ok=True)
+     for img_id in tqdm(
+         new_test_without_labels["Id"],
+         desc="Copying test images",
+         total=len(new_test_without_labels),
+     ):
+         shutil.copy(raw / "train" / f"{img_id}.jpg", public / "test" / f"{img_id}.jpg")
+
+     # checks
+     assert len(new_train) + len(new_test) == len(
+         old_train
+     ), "Train and test length should sum to the original train length"
+     assert len(sample_submission) == len(
+         new_test
+     ), "Sample submission should have the same length as the test set"
+
+     assert (
+         new_train.columns.tolist() == old_train.columns.tolist()
+     ), "Old and new train columns should match"
+     assert (
+         new_test_without_labels.columns.tolist() == new_train.columns.tolist()[:-1]
+     ), "Public test columns should match train columns, minus the target column"
+     assert (
+         new_test.columns.tolist() == new_train.columns.tolist()
+     ), "Private test columns should match train columns"
+     assert sample_submission.columns.tolist() == [
+         "Id",
+         "Pawpularity",
+     ], "Sample submission columns should be Id, Pawpularity"
+
+     assert set(new_train["Id"]).isdisjoint(
+         set(new_test["Id"])
+     ), "Train and test ids should not overlap"
+
+     # check copy was successful
+     assert len(list((public / "train").glob("*.jpg"))) == len(
+         new_train
+     ), "Train images should match the train set"
+     assert len(list((public / "test").glob("*.jpg"))) == len(
+         new_test
+     ), "Test images should match the test set"

mlebench/competitions/petfinder-pawpularity-score/prepare_val.py (new file, +154)
@@ -0,0 +1,154 @@
+ import shutil
+ from pathlib import Path
+
+ import numpy as np
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+ from tqdm.auto import tqdm
+
+ from mlebench.utils import read_csv
+
+
+ def _process_split(
+     source_df: pd.DataFrame,
+     public_dir: Path,
+     private_dir: Path,
+     raw_images_dir: Path,
+     test_size: float,
+     random_state: int,
+ ):
+     """
+     Processes a single data split, creating train/test sets and associated files.
+
+     This helper function encapsulates the logic for:
+     1. Splitting a dataframe into train and test sets.
+     2. Creating public and private directories.
+     3. Saving train.csv, test.csv (public), test.csv (private), and sample_submission.csv.
+     4. Copying the corresponding images.
+     5. Running assertions to verify the split.
+
+     Args:
+         source_df (pd.DataFrame): The dataframe to be split.
+         public_dir (Path): The public output directory.
+         private_dir (Path): The private output directory.
+         raw_images_dir (Path): The directory containing the source raw images.
+         test_size (float): The proportion of the dataset to allocate to the test split.
+         random_state (int): The random state for reproducibility.
+
+     Returns:
+         pd.DataFrame: The train portion of the split dataframe.
+     """
+     # Create output directories
+     public_dir.mkdir(exist_ok=True, parents=True)
+     private_dir.mkdir(exist_ok=True, parents=True)
+
+     # Perform the split
+     train_df, test_df = train_test_split(
+         source_df, test_size=test_size, random_state=random_state
+     )
+
+     test_df_without_labels = test_df.drop(columns=["Pawpularity"])
+
+     # Create a sample submission file
+     np_rng = np.random.default_rng(random_state)
+     sample_submission = test_df[["Id", "Pawpularity"]].copy()
+     sample_submission["Pawpularity"] = np_rng.uniform(1, 100, len(sample_submission)).round(2)
+
+     # Save CSV files
+     train_df.to_csv(public_dir / "train.csv", index=False)
+     test_df.to_csv(private_dir / "test.csv", index=False)
+     test_df_without_labels.to_csv(public_dir / "test.csv", index=False)
+     sample_submission.to_csv(public_dir / "sample_submission.csv", index=False)
+
+     # Copy train images
+     (public_dir / "train").mkdir(exist_ok=True)
+     for img_id in tqdm(
+         train_df["Id"], desc=f"Copying train images to {public_dir.name}", total=len(train_df)
+     ):
+         shutil.copy(raw_images_dir / f"{img_id}.jpg", public_dir / "train" / f"{img_id}.jpg")
+
+     # Copy test images
+     (public_dir / "test").mkdir(exist_ok=True)
+     for img_id in tqdm(
+         test_df_without_labels["Id"],
+         desc=f"Copying test images to {public_dir.name}",
+         total=len(test_df_without_labels),
+     ):
+         shutil.copy(raw_images_dir / f"{img_id}.jpg", public_dir / "test" / f"{img_id}.jpg")
+
+     # checks
+     assert len(train_df) + len(test_df) == len(
+         source_df
+     ), "Train and test length should sum to the source df length"
+     assert len(sample_submission) == len(
+         test_df
+     ), "Sample submission should have the same length as the test set"
+     assert (
+         train_df.columns.tolist() == source_df.columns.tolist()
+     ), "Train columns should match source columns"
+     assert (
+         test_df_without_labels.columns.tolist() == train_df.columns.tolist()[:-1]
+     ), "Public test columns should match train columns, minus the target column"
+     assert (
+         test_df.columns.tolist() == train_df.columns.tolist()
+     ), "Private test columns should match train columns"
+     assert sample_submission.columns.tolist() == [
+         "Id",
+         "Pawpularity",
+     ], "Sample submission columns should be Id, Pawpularity"
+     assert set(train_df["Id"]).isdisjoint(
+         set(test_df["Id"])
+     ), "Train and test ids should not overlap"
+     assert len(list((public_dir / "train").glob("*.jpg"))) == len(
+         train_df
+     ), "Train images should match the train set"
+     assert len(list((public_dir / "test").glob("*.jpg"))) == len(
+         test_df
+     ), "Test images should match the test set"
+
+     return train_df
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+
+     old_train = read_csv(raw / "train.csv")
+     raw_images_dir = raw / "train"
+
+     # --- First Split: Create the original train/test sets ---
+     # This split creates the main `public` and `private` directories.
+     # Its outputs must remain identical to the original script.
+     # Original ratio: 6800/(9912 + 6800) = ~ 0.41 test_size
+     # We use 0.1 ratio to avoid taking out too many samples from train
+     original_test_size = 0.1
+     train_from_first_split = _process_split(
+         source_df=old_train,
+         public_dir=public,
+         private_dir=private,
+         raw_images_dir=raw_images_dir,
+         test_size=original_test_size,
+         random_state=0,
+     )
+
+     # --- Second Split: Create the new validation sets from the first split's train set ---
+     # Define new paths for the validation set outputs.
+     public_val = public.parent / "public_val"
+     private_val = private.parent / "private_val"
+
+     # Calculate the test size for the second split to make the new test_val set
+     # have approximately the same size as the original test set.
+     # size(test_val) = size(test_original)
+     # test_size_val * size(train_from_first_split) = original_test_size * size(old_train)
+     # test_size_val * (1 - original_test_size) * size(old_train) = original_test_size * size(old_train)
+     # test_size_val = original_test_size / (1 - original_test_size)
+     val_test_size = original_test_size / (1 - original_test_size)
+
+     # This split creates `public_val` and `private_val` directories.
+     # The random_state is kept the same for consistency.
+     _process_split(
+         source_df=train_from_first_split,
+         public_dir=public_val,
+         private_dir=private_val,
+         raw_images_dir=raw_images_dir,
+         test_size=val_test_size,
+         random_state=0,
+     )
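
A quick numeric check (a sketch, not from the diff) of the test-size algebra in the comments above, taking the 9,912-row train figure quoted there as an illustration:

    original_test_size = 0.1
    val_test_size = original_test_size / (1 - original_test_size)  # = 1/9 ≈ 0.1111

    n_total = 9_912                                      # illustrative train.csv row count
    n_first_test = n_total * original_test_size          # ≈ 991 rows in the first test split
    n_first_train = n_total * (1 - original_test_size)   # ≈ 8,921 rows left for training
    n_val_test = n_first_train * val_test_size           # ≈ 991 rows, matching n_first_test
    print(round(n_first_test), round(n_val_test))        # 991 991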

mlebench/competitions/plant-pathology-2020-fgvc7/grade.py (new file, +41)
@@ -0,0 +1,41 @@
+ import numpy as np
+ import pandas as pd
+ from sklearn.metrics import roc_auc_score
+
+ from mlebench.grade_helpers import InvalidSubmissionError
+
+
+ def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+     expected_answers_columns = ["image_id", "healthy", "multiple_diseases", "rust", "scab"]
+     expected_sample_submission_columns = expected_answers_columns
+
+     assert set(answers.columns) == set(
+         expected_answers_columns
+     ), f"Expected `answers` to have columns {expected_answers_columns} but got {answers.columns}"
+
+     if len(submission) != len(answers):
+         raise InvalidSubmissionError(
+             f"Expected submission to have {len(answers)} rows but got {len(submission)}"
+         )
+
+     if set(submission.columns) != set(expected_sample_submission_columns):
+         raise InvalidSubmissionError(
+             f"Expected submission to have columns {expected_sample_submission_columns} but got {submission.columns}"
+         )
+
+     submission_sorted = submission.sort_values(by="image_id").reset_index(drop=True)
+     answers_sorted = answers.sort_values(by="image_id").reset_index(drop=True)
+
+     if not all(submission_sorted["image_id"] == answers_sorted["image_id"]):
+         raise InvalidSubmissionError(
+             "Submission and answers DataFrames must have matching 'image_id' columns."
+         )
+
+     class_columns = ["healthy", "multiple_diseases", "rust", "scab"]
+     scores = []
+
+     for cls in class_columns:
+         score = roc_auc_score(y_true=answers_sorted[cls], y_score=submission_sorted[cls])
+         scores.append(score)
+
+     return np.mean(scores)
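
For illustration (not part of the diff), calling the grader above on a tiny hand-made submission; the score is the mean of the four per-class ROC AUC values:

    import pandas as pd

    answers = pd.DataFrame(
        {
            "image_id": ["Test_0", "Test_1", "Test_2", "Test_3"],
            "healthy": [1, 0, 0, 0],
            "multiple_diseases": [0, 1, 0, 0],
            "rust": [0, 0, 1, 0],
            "scab": [0, 0, 0, 1],
        }
    )
    # A correct, fully confident submission gets an AUC of 1.0 in every column.
    submission = answers.copy()
    print(grade(submission, answers))  # 1.0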

mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py (new file, +74)
@@ -0,0 +1,74 @@
+ import shutil
+ from pathlib import Path
+
+ from sklearn.model_selection import train_test_split
+ from tqdm.auto import tqdm
+
+ from mlebench.utils import read_csv
+
+
+ def prepare(raw: Path, public: Path, private: Path) -> None:
+     expected_train_columns = ["image_id", "healthy", "multiple_diseases", "rust", "scab"]
+     expected_test_columns = ["image_id"]
+     expected_answers_columns = expected_train_columns
+     expected_sample_submission_columns = expected_train_columns
+
+     old_train = read_csv(raw / "train.csv")
+     new_train, answers = train_test_split(old_train, test_size=0.1, random_state=0)
+
+     assert set(new_train.columns) == set(
+         expected_train_columns
+     ), f"Expected `new_train` to have columns {expected_train_columns} but got {new_train.columns}"
+
+     assert set(answers.columns) == set(
+         expected_answers_columns
+     ), f"Expected `answers` to have columns {expected_answers_columns} but got {answers.columns}"
+
+     new_train_image_ids = new_train["image_id"].unique()
+     new_test_image_ids = answers["image_id"].unique()
+     to_new_image_id = {
+         **{old_id: f"Train_{i}" for i, old_id in enumerate(new_train_image_ids)},
+         **{old_id: f"Test_{i}" for i, old_id in enumerate(new_test_image_ids)},
+     }
+
+     new_train["image_id"] = new_train["image_id"].replace(to_new_image_id)
+     answers["image_id"] = answers["image_id"].replace(to_new_image_id)
+
+     new_test = answers[["image_id"]].copy()
+
+     assert set(new_test.columns) == set(
+         expected_test_columns
+     ), f"Expected `new_test` to have columns {expected_test_columns} but got {new_test.columns}"
+
+     sample_submission = answers[["image_id"]].copy()
+     sample_submission[["healthy", "multiple_diseases", "rust", "scab"]] = 0.25
+
+     assert set(sample_submission.columns) == set(
+         expected_sample_submission_columns
+     ), f"Expected `sample_submission` to have columns {expected_sample_submission_columns} but got {sample_submission.columns}"
+
+     private.mkdir(exist_ok=True, parents=True)
+     public.mkdir(exist_ok=True, parents=True)
+     (public / "images").mkdir(exist_ok=True)
+
+     for old_image_id in tqdm(old_train["image_id"], desc="Copying over train & test images"):
+         assert old_image_id.startswith(
+             "Train_"
+         ), f"Expected train image id `{old_image_id}` to start with `Train_`."
+
+         new_image_id = to_new_image_id.get(old_image_id, old_image_id)
+
+         assert (
+             raw / "images" / f"{old_image_id}.jpg"
+         ).exists(), f"Image `{old_image_id}.jpg` does not exist in `{raw / 'images'}`."
+
+         shutil.copyfile(
+             src=raw / "images" / f"{old_image_id}.jpg",
+             dst=public / "images" / f"{new_image_id}.jpg",
+         )
+
+     answers.to_csv(private / "test.csv", index=False)
+
+     sample_submission.to_csv(public / "sample_submission.csv", index=False)
+     new_test.to_csv(public / "test.csv", index=False)
+     new_train.to_csv(public / "train.csv", index=False)

mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py (new file, +160)
@@ -0,0 +1,160 @@
+ import shutil
+ from pathlib import Path
+
+ from sklearn.model_selection import train_test_split
+ from tqdm.auto import tqdm
+
+ from mlebench.utils import read_csv
+
+
+ def prepare(raw: Path, public: Path, private: Path) -> None:
+     # --- Expected Column Definitions (used for both splits) ---
+     expected_train_columns = ["image_id", "healthy", "multiple_diseases", "rust", "scab"]
+     expected_test_columns = ["image_id"]
+     expected_answers_columns = expected_train_columns
+     expected_sample_submission_columns = expected_train_columns
+
+     # =================================================================
+     # == STAGE 1: Create the original train/test split. ==
+     # == This section is preserved to ensure the original `public` ==
+     # == and `private` directories are identical to the original script. ==
+     # =================================================================
+
+     old_train = read_csv(raw / "train.csv")
+     new_train, answers = train_test_split(old_train, test_size=0.1, random_state=0)
+
+     assert set(new_train.columns) == set(
+         expected_train_columns
+     ), f"Expected `new_train` to have columns {expected_train_columns} but got {new_train.columns}"
+
+     assert set(answers.columns) == set(
+         expected_answers_columns
+     ), f"Expected `answers` to have columns {expected_answers_columns} but got {answers.columns}"
+
+     new_train_image_ids = new_train["image_id"].unique()
+     new_test_image_ids = answers["image_id"].unique()
+     to_new_image_id = {
+         **{old_id: f"Train_{i}" for i, old_id in enumerate(new_train_image_ids)},
+         **{old_id: f"Test_{i}" for i, old_id in enumerate(new_test_image_ids)},
+     }
+
+     # IMPORTANT: The `new_train` DataFrame is modified here and will be used
+     # as the input for the second split. We make a copy to preserve it
+     # before its image_ids are changed in-place for the first split's output.
+     train_for_val_split = new_train.copy()
+     new_train["image_id"] = new_train["image_id"].replace(to_new_image_id)
+     answers["image_id"] = answers["image_id"].replace(to_new_image_id)
+
+     new_test = answers[["image_id"]].copy()
+
+     assert set(new_test.columns) == set(
+         expected_test_columns
+     ), f"Expected `new_test` to have columns {expected_test_columns} but got {new_test.columns}"
+
+     sample_submission = answers[["image_id"]].copy()
+     sample_submission[["healthy", "multiple_diseases", "rust", "scab"]] = 0.25
+
+     assert set(sample_submission.columns) == set(
+         expected_sample_submission_columns
+     ), f"Expected `sample_submission` to have columns {expected_sample_submission_columns} but got {sample_submission.columns}"
+
+     private.mkdir(exist_ok=True, parents=True)
+     public.mkdir(exist_ok=True, parents=True)
+     (public / "images").mkdir(exist_ok=True)
+
+     # Note: This loop copies ALL images defined in the original raw train set.
+     for old_image_id in tqdm(old_train["image_id"], desc="Copying over train & test images"):
+         assert old_image_id.startswith(
+             "Train_"
+         ), f"Expected train image id `{old_image_id}` to start with `Train_`."
+
+         new_image_id = to_new_image_id.get(old_image_id, old_image_id)
+
+         assert (
+             raw / "images" / f"{old_image_id}.jpg"
+         ).exists(), f"Image `{old_image_id}.jpg` does not exist in `{raw / 'images'}`."
+
+         shutil.copyfile(
+             src=raw / "images" / f"{old_image_id}.jpg",
+             dst=public / "images" / f"{new_image_id}.jpg",
+         )
+
+     answers.to_csv(private / "test.csv", index=False)
+     sample_submission.to_csv(public / "sample_submission.csv", index=False)
+     new_test.to_csv(public / "test.csv", index=False)
+     new_train.to_csv(public / "train.csv", index=False)
+
+     # =================================================================
+     # == STAGE 2: Create the new validation split. ==
+     # == This section splits the `new_train` set from STAGE 1 to ==
+     # == create a smaller training set and a validation set. ==
+     # =================================================================
+     print("\nStarting second split to create validation set...")
+
+     # Define paths for the new validation set directories
+     public_val = public.parent / "public_val"
+     private_val = private.parent / "private_val"
+
+     # To get a validation test set of roughly the same size as the original
+     # test set (10% of total), we must take 1/9th of the training set (90% of total).
+     # (0.1 * total) / (0.9 * total) = 1/9
+     val_test_size = 1 / 9.0
+
+     # Perform the second split on the original training data
+     train_val, answers_val = train_test_split(
+         train_for_val_split, test_size=val_test_size, random_state=0
+     )
+
+     # --- Replicate the ID renaming and file creation logic for the new split ---
+
+     train_val_image_ids = train_val["image_id"].unique()
+     test_val_image_ids = answers_val["image_id"].unique()
+     to_new_val_image_id = {
+         **{old_id: f"Train_{i}" for i, old_id in enumerate(train_val_image_ids)},
+         **{old_id: f"Test_{i}" for i, old_id in enumerate(test_val_image_ids)},
+     }
+
+     # The original image IDs from this split are keys in `to_new_image_id`.
+     # We need to map them to find the source file in `public/images`.
+     # e.g., raw 'Train_123' -> 1st split 'Train_45' -> 2nd split 'Test_6'
+     id_mapper_raw_to_val = {
+         raw_id: val_id
+         for raw_id, val_id in to_new_val_image_id.items()
+     }
+
+     # Get the intermediate filenames from the first split's mapping
+     source_to_dest_val_map = {
+         to_new_image_id[raw_id]: val_id
+         for raw_id, val_id in id_mapper_raw_to_val.items()
+     }
+
+
+     train_val["image_id"] = train_val["image_id"].replace(to_new_val_image_id)
+     answers_val["image_id"] = answers_val["image_id"].replace(to_new_val_image_id)
+
+     test_val = answers_val[["image_id"]].copy()
+     sample_submission_val = answers_val[["image_id"]].copy()
+     sample_submission_val[["healthy", "multiple_diseases", "rust", "scab"]] = 0.25
+
+     # Create the new directories
+     private_val.mkdir(exist_ok=True, parents=True)
+     public_val.mkdir(exist_ok=True, parents=True)
+     (public_val / "images").mkdir(exist_ok=True)
+
+     # Copy images for the validation split. The source is the `public/images`
+     # directory created in the first stage.
+     for source_filename_stem, dest_filename_stem in tqdm(
+         source_to_dest_val_map.items(), desc="Copying over validation images"
+     ):
+         shutil.copyfile(
+             src=public / "images" / f"{source_filename_stem}.jpg",
+             dst=public_val / "images" / f"{dest_filename_stem}.jpg",
+         )
+
+     # Save all files for the validation split into the new directories
+     answers_val.to_csv(private_val / "test.csv", index=False)
+     sample_submission_val.to_csv(public_val / "sample_submission.csv", index=False)
+     test_val.to_csv(public_val / "test.csv", index=False)
+     train_val.to_csv(public_val / "train.csv", index=False)
+
+     print("\nValidation set created successfully.")

mlebench/competitions/plant-pathology-2021-fgvc8/grade.py (new file, +54)
@@ -0,0 +1,54 @@
+ import pandas as pd
+ from scipy.sparse import csr_matrix
+ from sklearn.metrics import f1_score
+ from sklearn.preprocessing import MultiLabelBinarizer
+
+ from mlebench.grade_helpers import InvalidSubmissionError
+
+
+ def prepare_for_metric(
+     submission: pd.DataFrame, answers: pd.DataFrame
+ ) -> tuple[csr_matrix, csr_matrix]:
+     """Transforms the submission and answers DataFrames into the required format for grading as sparse matrices."""
+
+     # Answers checks
+     assert (
+         "image" in answers.columns and "labels" in answers.columns
+     ), "Answers DataFrame must have 'image' and 'labels' columns"
+
+     # Submission checks
+     if "image" not in submission.columns:
+         raise InvalidSubmissionError("Submission DataFrame must have an 'image' column.")
+     if "labels" not in submission.columns:
+         raise InvalidSubmissionError("Submission DataFrame must have a 'labels' column.")
+     if len(submission) != len(answers):
+         raise InvalidSubmissionError(
+             "Submission and answers DataFrames must have the same number of rows."
+         )
+
+     # Match order
+     submission = submission.sort_values(by="image").reset_index(drop=True)
+     answers = answers.sort_values(by="image").reset_index(drop=True)
+     if not all(submission["image"] == answers["image"]):
+         raise InvalidSubmissionError(
+             "Submission and answers DataFrames must have matching 'image' columns."
+         )
+
+     # Get classes
+     classes = set(answers["labels"].str.split().explode().unique())
+
+     # fillna with empty string
+     answers["labels"] = answers["labels"].fillna("")
+     submission["labels"] = submission["labels"].fillna("")
+
+     # Convert to sparse matrices using MultiLabelBinarizer
+     mlb = MultiLabelBinarizer(classes=sorted(classes), sparse_output=True)
+     y_true = mlb.fit_transform(answers["labels"].str.split())
+     y_pred = mlb.transform(submission["labels"].str.split())
+
+     return y_true, y_pred
+
+
+ def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+     y_true, y_pred = prepare_for_metric(submission, answers)
+     return f1_score(y_true=y_true, y_pred=y_pred, average="micro")
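
As an illustrative call (not from the diff) of the grader above: labels are space-separated strings per image, binarized with MultiLabelBinarizer, and scored with micro-averaged F1; the label names here are placeholders:

    import pandas as pd

    answers = pd.DataFrame(
        {
            "image": ["a.jpg", "b.jpg", "c.jpg"],
            "labels": ["scab", "healthy", "scab frog_eye_leaf_spot"],
        }
    )
    submission = pd.DataFrame(
        {
            "image": ["a.jpg", "b.jpg", "c.jpg"],
            "labels": ["scab", "healthy", "scab"],  # misses one label on c.jpg
        }
    )
    # 3 true positives, 0 false positives, 1 false negative -> micro F1 = 2*3 / (2*3 + 0 + 1) ≈ 0.857
    print(grade(submission, answers))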

mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py (new file, +65)
@@ -0,0 +1,65 @@
+ import shutil
+ from pathlib import Path
+
+ from sklearn.model_selection import train_test_split
+ from tqdm.auto import tqdm
+
+ from mlebench.utils import extract, read_csv
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     """
+     Splits the data in raw into public and private datasets with appropriate test/train splits.
+     """
+
+     # Create train, test from train split
+     old_train = read_csv(raw / "train.csv")
+     new_train, answers = train_test_split(old_train, test_size=0.2, random_state=0)
+
+     # Create a sample submission file
+     submission_df = answers.copy()
+     submission_df["labels"] = "healthy"
+
+     # Checks
+     assert len(answers) == len(submission_df), "Answers and submission should have the same length"
+     assert not set(new_train["image"]).intersection(
+         set(answers["image"])
+     ), "new_train and answers should not share any image"
+     assert (
+         "image" in new_train.columns and "labels" in new_train.columns
+     ), "Train DataFrame must have 'image' and 'labels' columns"
+     assert (
+         "image" in submission_df.columns and "labels" in submission_df.columns
+     ), "Sample submission DataFrame must have 'image' and 'labels' columns"
+     assert len(new_train) + len(answers) == len(
+         old_train
+     ), "The combined length of new_train and answers should equal the length of old_train"
+
+     # Write CSVs
+     answers.to_csv(private / "answers.csv", index=False)
+     new_train.to_csv(public / "train.csv", index=False)
+     submission_df.to_csv(public / "sample_submission.csv", index=False)
+
+     # Copy files
+     (public / "test_images").mkdir(exist_ok=True)
+     (public / "train_images").mkdir(exist_ok=True)
+
+     for file_id in tqdm(new_train["image"], desc="Copying Train Images"):
+         shutil.copyfile(
+             src=raw / "train_images" / f"{file_id}",
+             dst=public / "train_images" / f"{file_id}",
+         )
+
+     for file_id in tqdm(answers["image"], desc="Copying Test Images"):
+         shutil.copyfile(
+             src=raw / "train_images" / f"{file_id}",
+             dst=public / "test_images" / f"{file_id}",
+         )
+
+     # Checks
+     assert len(list(public.glob("train_images/*.jpg"))) == len(
+         new_train
+     ), "Public train images should have the same number of images as the train DataFrame"
+     assert len(list(public.glob("test_images/*.jpg"))) == len(
+         answers
+     ), "Public test images should have the same number of images as the answers DataFrame"