dslighting 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352)
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/METADATA +1 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,92 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ from sklearn.model_selection import train_test_split
5
+ from tqdm import tqdm
6
+
7
+ from mlebench.utils import extract, get_logger, read_csv
8
+
9
+ logger = get_logger(__name__)
10
+
11
+
12
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.

    Args:
        raw: Directory holding the original download (train.csv, train_images.zip,
            unicode_translation.csv).
        public: Output directory for participant-visible files (train/test image zips,
            train.csv, sample_submission.csv, unicode_translation.csv).
        private: Output directory for held-out files (test.csv, gold_submission.csv).
    """
    # Extract images so we can split the train images
    extract(raw / "train_images.zip", raw / "train")

    # Create train, test from train split (fixed seed for reproducibility)
    old_train = read_csv(raw / "train.csv")

    new_train, new_test = train_test_split(old_train, test_size=0.1, random_state=0)

    # Save the new train and test
    new_train.to_csv(public / "train.csv", index=False)
    new_test.to_csv(private / "test.csv", index=False)

    # Copy images
    (public / "train").mkdir(exist_ok=True)
    (public / "test").mkdir(exist_ok=True)

    for file_id in tqdm(new_train["image_id"], desc="Copying train images"):
        shutil.copyfile(
            src=raw / "train" / f"{file_id}.jpg",
            dst=public / "train" / f"{file_id}.jpg",
        )

    for file_id in tqdm(new_test["image_id"], desc="Copying test images"):
        shutil.copyfile(
            src=raw / "train" / f"{file_id}.jpg",
            dst=public / "test" / f"{file_id}.jpg",
        )

    # Sanity-check the split: no image lost, no image duplicated across splits
    assert len(list(public.glob("train/*.jpg"))) == len(new_train)
    assert len(list(public.glob("test/*.jpg"))) == len(new_test)
    assert len(new_train) + len(new_test) == len(old_train)

    # Create zips of the images
    logger.info("Re-zipping up new image directories...")
    shutil.make_archive(str(public / "train_images"), "zip", public / "train")
    shutil.make_archive(str(public / "test_images"), "zip", public / "test")
    # Remove the directories for consistency with the kaggle data
    shutil.rmtree(public / "train")
    shutil.rmtree(public / "test")

    # Copy unicode_translation
    shutil.copyfile(
        src=raw / "unicode_translation.csv",
        dst=public / "unicode_translation.csv",
    )

    assert (public / "train_images.zip").is_file()
    assert (public / "test_images.zip").is_file()
    assert (public / "unicode_translation.csv").is_file()

    # Make sample submission for new test set
    sample_submission = new_test.copy()
    # Same guess for all, as in original sample submission
    sample_submission["labels"] = "U+003F 1 1 U+FF2F 2 2"
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    # Make a gold submission in private/ (useful for testing)
    # This submission takes the "class x y w h" labels from test and converts them to
    # "class x+1 y+1" labels (the +1 moves the coord into the bbox, so that the metric picks it up)
    submission_labels = [_label_to_gold(label) for label in new_test["labels"]]
    gold_submission = new_test.copy()
    gold_submission["labels"] = submission_labels
    gold_submission.to_csv(private / "gold_submission.csv", index=False)

    assert len(gold_submission) == len(new_test)
    assert len(gold_submission) == len(sample_submission)


def _label_to_gold(label: str) -> str:
    """Convert one "class x y w h ..." label string to its gold "class x+1 y+1 ..." form.

    Labels are whitespace-separated groups of 5 tokens; the width/height tokens are
    dropped and 1 is added to each x and y so the point lands inside the bbox.
    """
    tokens = label.split()
    assert len(tokens) % 5 == 0
    classes, xs, ys = tokens[0::5], tokens[1::5], tokens[2::5]
    # +1 to xs and ys
    xs = [str(int(x) + 1) for x in xs]
    ys = [str(int(y) + 1) for y in ys]
    return " ".join(f"{c} {x} {y}" for c, x, y in zip(classes, xs, ys))
@@ -0,0 +1,149 @@
1
+ import shutil
2
+ from pathlib import Path
3
+ from typing import TYPE_CHECKING
4
+
5
+ from sklearn.model_selection import train_test_split
6
+ from tqdm import tqdm
7
+
8
+ from mlebench.utils import extract, get_logger, read_csv
9
+
10
+ if TYPE_CHECKING:
11
+ import pandas as pd
12
+
13
+ logger = get_logger(__name__)
14
+
15
+
16
def _create_split_files(
    train_df: "pd.DataFrame",
    test_df: "pd.DataFrame",
    public_dir: Path,
    private_dir: Path,
    raw_dir: Path,
):
    """
    Helper function to create all necessary files for a given train/test split.

    This function populates the public and private directories with the respective
    data (images, CSVs, submissions) based on the provided dataframes.

    Args:
        train_df: Metadata rows (with "image_id" and "labels") for the new train split.
        test_df: Metadata rows for the new test split; its labels stay in private_dir.
        public_dir: Destination for participant-visible files.
        private_dir: Destination for grading-only files (test labels, gold submission).
        raw_dir: Location of the original data; images are read from raw_dir / "train".
    """
    # Save the new train and test metadata
    train_df.to_csv(public_dir / "train.csv", index=False)
    test_df.to_csv(private_dir / "test.csv", index=False)

    # Copy images. parents=True makes this robust when public_dir itself does not
    # exist yet (consistent with the mkdir convention in the sibling prepare scripts).
    (public_dir / "train").mkdir(parents=True, exist_ok=True)
    (public_dir / "test").mkdir(parents=True, exist_ok=True)
    raw_img_dir = raw_dir / "train"

    for file_id in tqdm(train_df["image_id"], desc=f"Copying train images to {public_dir.name}"):
        shutil.copyfile(
            src=raw_img_dir / f"{file_id}.jpg",
            dst=public_dir / "train" / f"{file_id}.jpg",
        )

    for file_id in tqdm(test_df["image_id"], desc=f"Copying test images to {public_dir.name}"):
        shutil.copyfile(
            src=raw_img_dir / f"{file_id}.jpg",
            dst=public_dir / "test" / f"{file_id}.jpg",
        )

    assert len(list(public_dir.glob("train/*.jpg"))) == len(train_df)
    assert len(list(public_dir.glob("test/*.jpg"))) == len(test_df)

    # Create zips of the images
    logger.info(f"Re-zipping up new image directories for {public_dir.name}...")
    shutil.make_archive(str(public_dir / "train_images"), "zip", public_dir / "train")
    shutil.make_archive(str(public_dir / "test_images"), "zip", public_dir / "test")
    # Remove the directories for consistency with the kaggle data
    shutil.rmtree(public_dir / "train")
    shutil.rmtree(public_dir / "test")

    # Copy unicode_translation
    shutil.copyfile(
        src=raw_dir / "unicode_translation.csv",
        dst=public_dir / "unicode_translation.csv",
    )

    assert (public_dir / "train_images.zip").is_file()
    assert (public_dir / "test_images.zip").is_file()
    assert (public_dir / "unicode_translation.csv").is_file()

    # Make sample submission for new test set
    sample_submission = test_df.copy()
    # Same guess for all, as in original sample submission
    sample_submission["labels"] = "U+003F 1 1 U+FF2F 2 2"
    sample_submission.to_csv(public_dir / "sample_submission.csv", index=False)

    # Make a gold submission in private/ (useful for testing)
    # This submission takes the "class x y w h" labels from test and converts them to
    # "class x+1 y+1" labels (the +1 moves the coord into the bbox, so that the metric picks it up)
    submission_labels = []
    for label in test_df["labels"]:
        # Labels have the form "class x y w h class x y w h class x y w h ... "
        label = label.split()
        assert len(label) % 5 == 0
        classes, xs, ys = label[0::5], label[1::5], label[2::5]
        # +1 to xs and ys
        xs = [str(int(x) + 1) for x in xs]
        ys = [str(int(y) + 1) for y in ys]
        new_label = [f"{c} {x} {y}" for c, x, y in zip(classes, xs, ys)]
        submission_labels.append(" ".join(new_label))
    gold_submission = test_df.copy()
    gold_submission["labels"] = submission_labels
    gold_submission.to_csv(private_dir / "gold_submission.csv", index=False)

    assert len(gold_submission) == len(test_df)
    assert len(gold_submission) == len(sample_submission)
98
+
99
+
100
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.
    Additionally, creates a second train/validation split for local model development.
    """
    # Unpack the raw images so the train set can be re-split by image id.
    extract(raw / "train_images.zip", raw / "train")

    full_train = read_csv(raw / "train.csv")

    # --- Original Split (for final evaluation) ---
    logger.info("Creating original train/test split for 'public' and 'private' directories...")
    dev_split, holdout_split = train_test_split(full_train, test_size=0.1, random_state=0)

    _create_split_files(
        train_df=dev_split,
        test_df=holdout_split,
        public_dir=public,
        private_dir=private,
        raw_dir=raw,
    )
    assert len(dev_split) + len(holdout_split) == len(full_train)
    logger.info("Successfully created original split.")

    # --- New Validation Split (for local development) ---
    logger.info("Creating new train/validation split for 'public_val' and 'private_val' directories...")
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"
    for directory in (public_val, private_val):
        directory.mkdir(exist_ok=True)

    # The validation split mirrors the size of the original test split: the first
    # split held out 0.1 of the data and kept 0.9 for training, so carving
    # 0.1 / 0.9 == 1/9 out of the remaining training rows yields a validation set
    # with the same absolute size as the original test set.
    train_val, test_val = train_test_split(dev_split, test_size=1 / 9, random_state=0)

    _create_split_files(
        train_df=train_val,
        test_df=test_val,
        public_dir=public_val,
        private_dir=private_val,
        raw_dir=raw,
    )
    assert len(train_val) + len(test_val) == len(dev_split)
    logger.info("Successfully created validation split.")
@@ -0,0 +1,101 @@
1
# The 99 leaf species of the competition, in the fixed order used for the one-hot
# label columns of train.csv / test.csv and for the sample-submission columns.
# NOTE(review): spellings such as "Phildelphus" and "Salix_Intergra" look like
# typos but match the original dataset's column names — do not "fix" them.
CLASSES = [
    "Acer_Capillipes",
    "Acer_Circinatum",
    "Acer_Mono",
    "Acer_Opalus",
    "Acer_Palmatum",
    "Acer_Pictum",
    "Acer_Platanoids",
    "Acer_Rubrum",
    "Acer_Rufinerve",
    "Acer_Saccharinum",
    "Alnus_Cordata",
    "Alnus_Maximowiczii",
    "Alnus_Rubra",
    "Alnus_Sieboldiana",
    "Alnus_Viridis",
    "Arundinaria_Simonii",
    "Betula_Austrosinensis",
    "Betula_Pendula",
    "Callicarpa_Bodinieri",
    "Castanea_Sativa",
    "Celtis_Koraiensis",
    "Cercis_Siliquastrum",
    "Cornus_Chinensis",
    "Cornus_Controversa",
    "Cornus_Macrophylla",
    "Cotinus_Coggygria",
    "Crataegus_Monogyna",
    "Cytisus_Battandieri",
    "Eucalyptus_Glaucescens",
    "Eucalyptus_Neglecta",
    "Eucalyptus_Urnigera",
    "Fagus_Sylvatica",
    "Ginkgo_Biloba",
    "Ilex_Aquifolium",
    "Ilex_Cornuta",
    "Liquidambar_Styraciflua",
    "Liriodendron_Tulipifera",
    "Lithocarpus_Cleistocarpus",
    "Lithocarpus_Edulis",
    "Magnolia_Heptapeta",
    "Magnolia_Salicifolia",
    "Morus_Nigra",
    "Olea_Europaea",
    "Phildelphus",
    "Populus_Adenopoda",
    "Populus_Grandidentata",
    "Populus_Nigra",
    "Prunus_Avium",
    "Prunus_X_Shmittii",
    "Pterocarya_Stenoptera",
    "Quercus_Afares",
    "Quercus_Agrifolia",
    "Quercus_Alnifolia",
    "Quercus_Brantii",
    "Quercus_Canariensis",
    "Quercus_Castaneifolia",
    "Quercus_Cerris",
    "Quercus_Chrysolepis",
    "Quercus_Coccifera",
    "Quercus_Coccinea",
    "Quercus_Crassifolia",
    "Quercus_Crassipes",
    "Quercus_Dolicholepis",
    "Quercus_Ellipsoidalis",
    "Quercus_Greggii",
    "Quercus_Hartwissiana",
    "Quercus_Ilex",
    "Quercus_Imbricaria",
    "Quercus_Infectoria_sub",
    "Quercus_Kewensis",
    "Quercus_Nigra",
    "Quercus_Palustris",
    "Quercus_Phellos",
    "Quercus_Phillyraeoides",
    "Quercus_Pontica",
    "Quercus_Pubescens",
    "Quercus_Pyrenaica",
    "Quercus_Rhysophylla",
    "Quercus_Rubra",
    "Quercus_Semecarpifolia",
    "Quercus_Shumardii",
    "Quercus_Suber",
    "Quercus_Texana",
    "Quercus_Trojana",
    "Quercus_Variabilis",
    "Quercus_Vulcanica",
    "Quercus_x_Hispanica",
    "Quercus_x_Turneri",
    "Rhododendron_x_Russellianum",
    "Salix_Fragilis",
    "Salix_Intergra",
    "Sorbus_Aria",
    "Tilia_Oliveri",
    "Tilia_Platyphyllos",
    "Tilia_Tomentosa",
    "Ulmus_Bergmanniana",
    "Viburnum_Tinus",
    "Viburnum_x_Rhytidophylloides",
    "Zelkova_Serrata",
]
@@ -0,0 +1,44 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.metrics import log_loss
4
+
5
+ from mlebench.competitions.utils import one_hot_dfs_to_log_loss_inputs
6
+ from mlebench.grade_helpers import InvalidSubmissionError
7
+
8
+ from .classes import CLASSES
9
+
10
+
11
def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame) -> dict:
    """
    The submission and answers are already one-hotted
    """
    expected_columns = {"id"} | set(CLASSES)
    if set(submission.columns) != expected_columns:
        raise InvalidSubmissionError(
            "Submission DataFrame must have an 'id' column and a column for each class."
        )
    if len(submission) != len(answers):
        raise InvalidSubmissionError(
            "Submission and answers DataFrames must have the same number of rows."
        )
    tolerance = 1e-6
    probabilities = submission[CLASSES]
    row_sums = probabilities.sum(axis=1)
    if not np.all(np.isclose(row_sums, 1, atol=tolerance)):
        raise InvalidSubmissionError("Each row in submission should sum to one, as probabilities.")
    within_unit_interval = (probabilities >= 0) & (probabilities <= 1)
    if not within_unit_interval.all().all():
        raise InvalidSubmissionError(
            "All probabilities in submission DataFrame must be between 0 and 1."
        )

    assert (
        set(answers.columns) == expected_columns
    ), "Answers DataFrame must have an 'id' column and a column for each class."

    # Align the two one-hot frames on "id" and build the keyword arguments that
    # the grading metric consumes.
    return one_hot_dfs_to_log_loss_inputs(
        submission, answers, id_column="id", apply_softmax=False
    )
40
+
41
+
42
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Validate the submission and score it with multi-class log loss."""
    metric_kwargs = prepare_for_metric(submission, answers)
    return log_loss(**metric_kwargs)
@@ -0,0 +1,60 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ from sklearn.model_selection import train_test_split
5
+
6
+ from mlebench.competitions.utils import df_to_one_hot
7
+ from mlebench.utils import extract, read_csv
8
+
9
+ from .classes import CLASSES
10
+
11
+
12
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.
    """
    # Unpack only the archives this script needs.
    extract(raw / "train.csv.zip", raw)
    extract(raw / "images.zip", raw)

    # Re-split the original training data into new train/test sets.
    original_train = read_csv(raw / "train.csv")
    split_train, split_test = train_test_split(original_train, test_size=0.1, random_state=0)
    public_test = split_test.drop(columns=["species"])

    # One-hot the held-out labels so they match the sample-submission format.
    split_test = split_test[["id", "species"]]
    split_test = df_to_one_hot(split_test, "id", "species", classes=CLASSES)

    (public / "images").mkdir(exist_ok=True)
    (private / "images").mkdir(exist_ok=True)

    # Train and test images are published together under public/images.
    for image_id in split_train["id"]:
        shutil.copyfile(
            src=raw / "images" / f"{image_id}.jpg",
            dst=public / "images" / f"{image_id}.jpg",
        )

    for image_id in public_test["id"]:
        shutil.copyfile(
            src=raw / "images" / f"{image_id}.jpg",
            dst=public / "images" / f"{image_id}.jpg",
        )

    # Check integrity of the files copied
    assert len(public_test) == len(
        split_test
    ), "Public and Private tests should have equal length"
    assert len(list(public.glob("images/*.jpg"))) == len(split_train) + len(
        public_test
    ), "Public images should have the same number of images as the sum of train and test"

    # Uniform-probability sample submission.
    sample_submission = split_test.copy()
    sample_submission[CLASSES] = 1 / len(CLASSES)

    # Write out all CSV artifacts.
    split_train.to_csv(public / "train.csv", index=False)
    split_test.to_csv(private / "test.csv", index=False)
    public_test.to_csv(public / "test.csv", index=False)
    sample_submission.to_csv(public / "sample_submission.csv", index=False)
@@ -0,0 +1,116 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ from pandas import DataFrame
5
+ from sklearn.model_selection import train_test_split
6
+
7
+ from mlebench.competitions.utils import df_to_one_hot
8
+ from mlebench.utils import extract, read_csv
9
+
10
+ from .classes import CLASSES
11
+
12
+
13
def _create_split_and_save(
    source_df: DataFrame,
    image_source_dir: Path,
    public_dir: Path,
    private_dir: Path,
    test_size: float,
    random_state: int,
) -> DataFrame:
    """Split ``source_df``, copy the split's images, and write all CSV artifacts.

    Returns the new training dataframe so callers can split it again.
    """
    split_train, split_test = train_test_split(
        source_df, test_size=test_size, random_state=random_state
    )
    unlabeled_test = split_test.drop(columns=["species"])

    # One-hot the held-out labels so they match the sample-submission format.
    split_test = split_test[["id", "species"]]
    split_test = df_to_one_hot(split_test, "id", "species", classes=CLASSES)

    # Create destination directories
    for directory in (public_dir / "images", private_dir / "images"):
        directory.mkdir(parents=True, exist_ok=True)

    # Train and test images are published together under public_dir/images.
    for image_id in split_train["id"]:
        shutil.copyfile(
            src=image_source_dir / f"{image_id}.jpg",
            dst=public_dir / "images" / f"{image_id}.jpg",
        )

    for image_id in unlabeled_test["id"]:
        shutil.copyfile(
            src=image_source_dir / f"{image_id}.jpg",
            dst=public_dir / "images" / f"{image_id}.jpg",
        )

    # Check integrity of the files copied
    assert len(unlabeled_test) == len(
        split_test
    ), "Public and Private tests should have equal length"
    assert len(list(public_dir.glob("images/*.jpg"))) == len(split_train) + len(
        unlabeled_test
    ), "Public images should have the same number of images as the sum of train and test"

    # Uniform-probability sample submission.
    sample_submission = split_test.copy()
    sample_submission[CLASSES] = 1 / len(CLASSES)

    # Save all dataframes to their respective files
    split_train.to_csv(public_dir / "train.csv", index=False)
    split_test.to_csv(private_dir / "test.csv", index=False)
    unlabeled_test.to_csv(public_dir / "test.csv", index=False)
    sample_submission.to_csv(public_dir / "sample_submission.csv", index=False)

    return split_train
69
+
70
+
71
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.
    Also creates a secondary validation split in public_val/private_val directories.
    """
    # extract only what we need
    extract(raw / "train.csv.zip", raw)
    extract(raw / "images.zip", raw)

    full_train_df = read_csv(raw / "train.csv")
    images_dir = raw / "images"

    # Stage 1: the primary competition split into public/ and private/.
    # Its outputs must stay identical to those of the original script.
    remaining_train = _create_split_and_save(
        source_df=full_train_df,
        image_source_dir=images_dir,
        public_dir=public,
        private_dir=private,
        test_size=0.1,
        random_state=0,
    )

    # Stage 2: split the stage-1 training set once more to carve out a local
    # validation set, written to sibling public_val/ and private_val/ dirs.
    # A test_size of 0.1 / 0.9 makes the validation set the same size as the
    # original test set (which held out 0.1 of all rows, leaving 0.9).
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"
    validation_test_size = 0.1 / 0.9

    _create_split_and_save(
        source_df=remaining_train,
        image_source_dir=images_dir,
        public_dir=public_val,
        private_dir=private_val,
        test_size=validation_test_size,
        random_state=0,  # Use the same random state for consistency
    )
@@ -0,0 +1,44 @@
1
+ from pandas import DataFrame
2
+ from sklearn.metrics import cohen_kappa_score
3
+
4
+ from mlebench.grade_helpers import InvalidSubmissionError
5
+
6
+
7
def prepare_for_grading(
    submission: DataFrame, answers: DataFrame, id_column: str, target_column: str
) -> tuple:
    """Validate the submission against the answers and extract aligned label arrays.

    Args:
        submission: Predicted values; must contain `id_column` and `target_column`.
        answers: Ground truth with the same columns and the same set of ids.
        id_column: Column used to align rows between the two frames.
        target_column: Column holding the values to grade.

    Returns:
        A ``(y_pred, y_true)`` tuple of numpy arrays, both sorted by id.

    Raises:
        InvalidSubmissionError: If the submission is malformed or its ids do not
            match the answers.
    """
    # Answers checks
    assert (
        target_column in answers.columns
    ), f"Answers must contain the target column '{target_column}'"
    assert id_column in answers.columns, f"Answers must contain the id column '{id_column}'"
    assert not answers.empty, "Answers DataFrame should not be empty"

    # Submission checks
    if target_column not in submission.columns:
        raise InvalidSubmissionError(f"Submission must contain the target column '{target_column}'")
    if id_column not in submission.columns:
        raise InvalidSubmissionError(f"Submission must contain the id column '{id_column}'")
    if submission.empty:
        raise InvalidSubmissionError("Submission DataFrame should not be empty")
    # Guard the length first: the element-wise id comparison below raises an
    # opaque broadcast ValueError when the arrays have different sizes.
    if len(submission) != len(answers):
        raise InvalidSubmissionError("Submission must contain the same ids as the answers")

    # Sort the submissions and answers by id_column
    submission_sorted = submission.sort_values(by=id_column)
    answers_sorted = answers.sort_values(by=id_column)

    if (submission_sorted[id_column].values != answers_sorted[id_column].values).any():
        raise InvalidSubmissionError("Submission must contain the same ids as the answers")

    # Extract the target columns
    y_pred = submission_sorted[target_column].values
    y_true = answers_sorted[target_column].values

    return y_pred, y_true
38
+
39
+
40
def grade(submission: DataFrame, answers: DataFrame) -> float:
    """Score an essay submission with quadratic-weighted Cohen's kappa."""
    y_pred, y_true = prepare_for_grading(
        submission=submission, answers=answers, id_column="essay_id", target_column="score"
    )
    # Quadratic weights penalize larger score disagreements more heavily.
    return cohen_kappa_score(y_pred, y_true, weights="quadratic")
@@ -0,0 +1,51 @@
1
+ from pathlib import Path
2
+
3
+ import numpy as np
4
+ from sklearn.model_selection import train_test_split
5
+
6
+ from mlebench.utils import read_csv
7
+
8
+
9
+ def prepare(raw: Path, public: Path, private: Path):
10
+ """
11
+ Splits the data in raw into public and private datasets with appropriate test/train splits.
12
+ """
13
+
14
+ # Create train, test from original train split
15
+ old_train = read_csv(raw / "train.csv")
16
+ # Original train has 17307 rows. Original hidden test has approx 8k rows. We just take 10% of the original train as the test set.
17
+ new_train, answers = train_test_split(old_train, test_size=0.1, random_state=0)
18
+ new_test = answers.drop(columns=["score"])
19
+
20
+ sample_submission = answers[["essay_id"]].copy()
21
+ sample_submission["score"] = np.random.RandomState(42).randint(
22
+ 1, 7, size=len(sample_submission)
23
+ )
24
+
25
+ # Checks
26
+ assert set(new_train["essay_id"]).isdisjoint(
27
+ set(new_test["essay_id"])
28
+ ), "Essay IDs in train and test sets are not disjoint"
29
+ assert len(new_train) + len(new_test) == len(
30
+ old_train
31
+ ), f"Train and test sets do not sum to original train set"
32
+ assert len(new_test) == len(
33
+ sample_submission
34
+ ), f"Test and sample submission sets do not have the same length"
35
+ assert (
36
+ new_train.columns.tolist() == old_train.columns.tolist()
37
+ ), f"Train set columns do not match original train set, got {new_train.columns.tolist()}"
38
+ assert new_test.columns.tolist() == [
39
+ "essay_id",
40
+ "full_text",
41
+ ], f"Test set columns do not match expected columns, got {new_test.columns.tolist()}"
42
+ assert sample_submission.columns.tolist() == [
43
+ "essay_id",
44
+ "score",
45
+ ], f"Sample submission set columns do not match expected columns, got {sample_submission.columns.tolist()}"
46
+
47
+ # Write CSVs
48
+ answers.to_csv(private / "answers.csv", index=False)
49
+ new_train.to_csv(public / "train.csv", index=False)
50
+ new_test.to_csv(public / "test.csv", index=False)
51
+ sample_submission.to_csv(public / "sample_submission.csv", index=False)