dslighting 1.7.1__py3-none-any.whl → 1.7.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352) hide show
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/METADATA +3 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,118 @@
1
+ """
2
+ Adapted from: https://www.kaggle.com/code/metric/kullback-leibler-divergence
3
+ Linked from: https://www.kaggle.com/competitions/hms-harmful-brain-activity-classification
4
+ """
5
+
6
+ from typing import Optional
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ import pandas.api.types
11
+
12
+ from . import kaggle_metric_utilities
13
+
14
+
15
class ParticipantVisibleError(Exception):
    """Exception for errors whose message is intended to be shown to participants.

    Follows the Kaggle metric convention of marking user-facing validation
    failures with a dedicated exception type.
    """
17
+
18
+
19
def kl_divergence(
    solution: pd.DataFrame,
    submission: pd.DataFrame,
    epsilon: float,
    micro_average: bool,
    sample_weights: Optional[pd.Series],
):
    """Compute the Kullback-Leibler divergence between per-row probability frames.

    NOTE: both ``solution`` and ``submission`` are mutated in place (the
    submission is clipped and the solution columns are overwritten with the
    per-cell loss terms), matching the original Kaggle metric implementation.

    Args:
        solution: True probability distribution, one distribution per row.
        submission: Predicted probabilities, same columns as ``solution``.
        epsilon: Clip bound applied to the submission, ``[epsilon, 1 - epsilon]``.
        micro_average: If True, average row-wise sums (optionally weighted);
            otherwise average the column-wise means.
        sample_weights: Optional per-row weights, only used with micro averaging.

    Returns:
        The (possibly weighted) average KL divergence as a float.
    """
    # Overwrite solution for convenience
    for col in solution.columns:
        # Prevent issue with populating int columns with floats
        if not pandas.api.types.is_float_dtype(solution[col]):
            solution[col] = solution[col].astype(float)

        # Clip both the min and max following Kaggle conventions for related metrics like log loss
        # Clipping the max avoids cases where the loss would be infinite or undefined, clipping the min
        # prevents users from playing games with the 20th decimal place of predictions.
        submission[col] = np.clip(submission[col], epsilon, 1 - epsilon)

        y_nonzero_indices = solution[col] != 0
        # FIX: removed a redundant second `astype(float)` here — the column is
        # already float after the dtype conversion at the top of this loop.
        solution.loc[y_nonzero_indices, col] = solution.loc[y_nonzero_indices, col] * np.log(
            solution.loc[y_nonzero_indices, col] / submission.loc[y_nonzero_indices, col]
        )
        # Set the loss equal to zero where y_true equals zero following the scipy convention:
        # https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.rel_entr.html#scipy.special.rel_entr
        solution.loc[~y_nonzero_indices, col] = 0

    if micro_average:
        return np.average(solution.sum(axis=1), weights=sample_weights)
    else:
        return np.average(solution.mean())
50
+
51
+
52
def score(
    solution: pd.DataFrame,
    submission: pd.DataFrame,
    row_id_column_name: str,
    epsilon: float = 10**-15,
    micro_average: bool = True,
    sample_weights_column_name: Optional[str] = None,
) -> float:
    """Score a submission with the Kullback-Leibler divergence.

    The KL divergence is technically undefined/infinite where the target
    equals zero; those cells are assigned a score of zero, effectively
    removing them from consideration. Because each row of predictions must
    sum to one, probability placed on a zero-target cell still reduces the
    probability available elsewhere, so there is an important indirect effect.

    https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence

    Args:
        solution: Ground-truth probabilities, including the row-id column.
        submission: Predicted probabilities, including the row-id column.
        row_id_column_name: Name of the id column removed from both frames
            (NOTE: both frames are mutated in place).
        epsilon: Submission probabilities are clipped to [epsilon, 1 - epsilon]
            since KL divergence is undefined for p=0 or p=1.
        micro_average: Row-wise average if True, column-wise average if False.
        sample_weights_column_name: Optional solution column holding per-row
            weights; only valid together with micro averaging.

    Returns:
        The KL divergence score as a float.

    Raises:
        ParticipantVisibleError: on a missing weights column, weights combined
            with ``micro_average=False``, or a solution column absent from the
            submission.
    """
    # Strip the id column from both frames (in place, per the original metric).
    del solution[row_id_column_name]
    del submission[row_id_column_name]

    sample_weights = None
    if sample_weights_column_name:
        if sample_weights_column_name not in solution.columns:
            raise ParticipantVisibleError(
                f"{sample_weights_column_name} not found in solution columns"
            )
        sample_weights = solution.pop(sample_weights_column_name)
        if not micro_average:
            raise ParticipantVisibleError("Sample weights are only valid if `micro_average` is `True`")

    # Every solution column must have a matching submission column.
    absent = [col for col in solution.columns if col not in submission.columns]
    if absent:
        raise ParticipantVisibleError(f"Missing submission column {absent[0]}")

    kaggle_metric_utilities.verify_valid_probabilities(solution, "solution")
    kaggle_metric_utilities.verify_valid_probabilities(submission, "submission")

    return kaggle_metric_utilities.safe_call_score(
        kl_divergence,
        solution,
        submission,
        epsilon=epsilon,
        micro_average=micro_average,
        sample_weights=sample_weights,
    )
@@ -0,0 +1,121 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ from sklearn.model_selection import train_test_split
5
+ from tqdm.auto import tqdm
6
+
7
+ from mlebench.utils import read_csv
8
+
9
+ from .constants import TARGET_COLS
10
+
11
+
12
def prepare(raw: Path, public: Path, private: Path):
    """Split the raw HMS training data into a public train set and a private test set.

    The split is performed on `spectrogram_id`, which is coarser than
    `eeg_id` (itself coarser than `label_id`), so all rows sharing a
    spectrogram land on the same side of the split and leakage is avoided.

    Args:
        raw: Directory holding the original competition download.
        public: Output directory for participant-visible files.
        private: Output directory for grading-only files.
    """
    old_train = read_csv(raw / "train.csv")

    # split based on `spectrogram_id`
    # this is coarser than `eeg_id` which is coarser than `label_id`, so we avoid data leakage
    train_spectrograms, test_spectrograms = train_test_split(
        old_train["spectrogram_id"].unique(), test_size=0.1, random_state=0
    )

    new_train = old_train[old_train["spectrogram_id"].isin(train_spectrograms)]
    new_test = old_train[old_train["spectrogram_id"].isin(test_spectrograms)]
    new_train.to_csv(public / "train.csv", index=False)
    new_test.to_csv(private / "test.csv", index=False)

    new_test_without_labels = new_test.copy()[["spectrogram_id", "eeg_id", "patient_id"]]
    new_test_without_labels.to_csv(public / "test.csv", index=False)

    gold_submission = new_test.copy()[["eeg_id"] + TARGET_COLS]
    # make the votes into probabilities naively
    # https://www.kaggle.com/competitions/hms-harmful-brain-activity-classification/discussion/468705#2606605
    gold_submission[TARGET_COLS] = gold_submission[TARGET_COLS].div(
        gold_submission[TARGET_COLS].sum(axis=1), axis=0
    )
    gold_submission.to_csv(private / "gold_submission.csv", index=False)

    sample_submission = gold_submission.copy()
    sample_submission[TARGET_COLS] = 1 / len(TARGET_COLS)
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    shutil.copytree(raw / "example_figures", public / "example_figures")

    # Compute the unique-id arrays once: each is reused for iteration and for
    # the post-copy assertions below. tqdm infers `total` via len(), so the
    # explicit `total=` arguments are unnecessary.
    train_eeg_ids = new_train["eeg_id"].unique()
    test_eeg_ids = new_test["eeg_id"].unique()

    # Both splits are copied out of the raw `train_eegs` directory.
    (public / "train_eegs").mkdir(parents=True, exist_ok=True)
    for eeg_id in tqdm(train_eeg_ids, desc="Train EEGs"):
        shutil.copy(
            raw / "train_eegs" / f"{eeg_id}.parquet", public / "train_eegs" / f"{eeg_id}.parquet"
        )
    (public / "test_eegs").mkdir(parents=True, exist_ok=True)
    for eeg_id in tqdm(test_eeg_ids, desc="Test EEGs"):
        shutil.copy(
            raw / "train_eegs" / f"{eeg_id}.parquet", public / "test_eegs" / f"{eeg_id}.parquet"
        )

    # Both splits are copied out of the raw `train_spectrograms` directory.
    (public / "train_spectrograms").mkdir(parents=True, exist_ok=True)
    for spectrogram_id in tqdm(train_spectrograms, desc="Train Spectrograms"):
        shutil.copy(
            raw / "train_spectrograms" / f"{spectrogram_id}.parquet",
            public / "train_spectrograms" / f"{spectrogram_id}.parquet",
        )
    (public / "test_spectrograms").mkdir(parents=True, exist_ok=True)
    for spectrogram_id in tqdm(test_spectrograms, desc="Test Spectrograms"):
        shutil.copy(
            raw / "train_spectrograms" / f"{spectrogram_id}.parquet",
            public / "test_spectrograms" / f"{spectrogram_id}.parquet",
        )

    assert len(list((public / "train_eegs").rglob("*"))) == len(
        train_eeg_ids
    ), "Unexpected number of train EEGs Copied"
    assert len(list((public / "test_eegs").rglob("*"))) == len(
        test_eeg_ids
    ), "Unexpected number of test EEGs Copied"

    assert len(list((public / "train_spectrograms").rglob("*"))) == len(
        train_spectrograms
    ), "Unexpected number of train Spectrograms Copied"
    assert len(list((public / "test_spectrograms").rglob("*"))) == len(
        test_spectrograms
    ), "Unexpected number of test Spectrograms Copied"

    assert set(new_train.spectrogram_id).isdisjoint(
        set(new_test.spectrogram_id)
    ), "Some spectrogram_ids are in both train and test"
    assert set(new_train.eeg_id).isdisjoint(
        set(new_test.eeg_id)
    ), "Some eeg_ids are in both train and test"

    assert (
        new_train.columns.tolist() == new_test.columns.tolist()
    ), "Columns mismatch between public train and private test"
    assert len(new_train.columns) == 15, "Unexpected number of columns in public train"
    assert len(new_test.columns) == 15, "Unexpected number of columns in private test"
    assert len(sample_submission.columns) == 7, "Unexpected number of columns in sample submission"
    assert len(gold_submission.columns) == 7, "Unexpected number of columns in gold submission"
    assert len(new_test_without_labels.columns) == 3, "Unexpected number of columns in private test"

    assert len(new_train) + len(new_test) == len(
        old_train
    ), "Expected train + test length to be equal to original train length"
    assert len(new_test_without_labels) == len(
        new_test
    ), "Length mismatch between public test and private test"
    assert len(sample_submission) == len(
        new_test
    ), "Length mismatch between sample submission and private test"
    assert len(gold_submission) == len(
        new_test
    ), "Length mismatch between gold submission and private test"
@@ -0,0 +1,190 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+ from sklearn.model_selection import train_test_split
6
+ from tqdm.auto import tqdm
7
+
8
+ from mlebench.utils import read_csv
9
+
10
+ from .constants import TARGET_COLS
11
+
12
+
13
def _process_split(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    public_dir: Path,
    private_dir: Path,
    raw_dir: Path,
):
    """
    Helper function to process a single train/test split and save all required files.
    This ensures that the logic for creating the main dataset and the validation dataset is identical.

    Args:
        train_df: Rows assigned to the (public) training set.
        test_df: Rows assigned to the (private) test set.
        public_dir: Output directory for participant-visible files.
        private_dir: Output directory for grading-only files.
        raw_dir: Directory holding the original competition download.
    """
    public_dir.mkdir(parents=True, exist_ok=True)
    private_dir.mkdir(parents=True, exist_ok=True)

    # Save main data CSVs
    train_df.to_csv(public_dir / "train.csv", index=False)
    test_df.to_csv(private_dir / "test.csv", index=False)

    # Save public test CSV (without labels)
    test_without_labels = test_df.copy()[["spectrogram_id", "eeg_id", "patient_id"]]
    test_without_labels.to_csv(public_dir / "test.csv", index=False)

    # Create and save submission files
    gold_submission = test_df.copy()[["eeg_id"] + TARGET_COLS]
    # make the votes into probabilities naively
    # https://www.kaggle.com/competitions/hms-harmful-brain-activity-classification/discussion/468705#2606605
    gold_submission[TARGET_COLS] = gold_submission[TARGET_COLS].div(
        gold_submission[TARGET_COLS].sum(axis=1), axis=0
    )
    gold_submission.to_csv(private_dir / "gold_submission.csv", index=False)

    sample_submission = gold_submission.copy()
    sample_submission[TARGET_COLS] = 1 / len(TARGET_COLS)
    sample_submission.to_csv(public_dir / "sample_submission.csv", index=False)

    # Compute the unique-id arrays once: each is reused for iteration and for
    # the assertions below. tqdm infers `total` via len(), so the explicit
    # `total=` arguments are unnecessary.
    train_eeg_ids = train_df["eeg_id"].unique()
    test_eeg_ids = test_df["eeg_id"].unique()
    train_spectrogram_ids = train_df["spectrogram_id"].unique()
    test_spectrogram_ids = test_df["spectrogram_id"].unique()

    # Copy EEG files (both splits come out of the raw `train_eegs` directory)
    (public_dir / "train_eegs").mkdir(parents=True, exist_ok=True)
    for eeg_id in tqdm(train_eeg_ids, desc=f"Train EEGs ({public_dir.name})"):
        shutil.copy(
            raw_dir / "train_eegs" / f"{eeg_id}.parquet",
            public_dir / "train_eegs" / f"{eeg_id}.parquet",
        )
    (public_dir / "test_eegs").mkdir(parents=True, exist_ok=True)
    for eeg_id in tqdm(test_eeg_ids, desc=f"Test EEGs ({public_dir.name})"):
        shutil.copy(
            raw_dir / "train_eegs" / f"{eeg_id}.parquet",
            public_dir / "test_eegs" / f"{eeg_id}.parquet",
        )

    # Copy Spectrogram files
    (public_dir / "train_spectrograms").mkdir(parents=True, exist_ok=True)
    for spectrogram_id in tqdm(
        train_spectrogram_ids, desc=f"Train Spectrograms ({public_dir.name})"
    ):
        shutil.copy(
            raw_dir / "train_spectrograms" / f"{spectrogram_id}.parquet",
            public_dir / "train_spectrograms" / f"{spectrogram_id}.parquet",
        )
    (public_dir / "test_spectrograms").mkdir(parents=True, exist_ok=True)
    for spectrogram_id in tqdm(
        test_spectrogram_ids, desc=f"Test Spectrograms ({public_dir.name})"
    ):
        shutil.copy(
            raw_dir / "train_spectrograms" / f"{spectrogram_id}.parquet",
            public_dir / "test_spectrograms" / f"{spectrogram_id}.parquet",
        )

    # Assertions for data integrity
    assert len(list((public_dir / "train_eegs").rglob("*"))) == len(
        train_eeg_ids
    ), "Unexpected number of train EEGs Copied"
    assert len(list((public_dir / "test_eegs").rglob("*"))) == len(
        test_eeg_ids
    ), "Unexpected number of test EEGs Copied"

    assert len(list((public_dir / "train_spectrograms").rglob("*"))) == len(
        train_spectrogram_ids
    ), "Unexpected number of train Spectrograms Copied"
    assert len(list((public_dir / "test_spectrograms").rglob("*"))) == len(
        test_spectrogram_ids
    ), "Unexpected number of test Spectrograms Copied"

    assert set(train_df.spectrogram_id).isdisjoint(
        set(test_df.spectrogram_id)
    ), "Some spectrogram_ids are in both train and test"
    assert set(train_df.eeg_id).isdisjoint(
        set(test_df.eeg_id)
    ), "Some eeg_ids are in both train and test"

    assert (
        train_df.columns.tolist() == test_df.columns.tolist()
    ), "Columns mismatch between public train and private test"
    assert len(train_df.columns) == 15, "Unexpected number of columns in public train"
    assert len(test_df.columns) == 15, "Unexpected number of columns in private test"
    assert len(sample_submission.columns) == 7, "Unexpected number of columns in sample submission"
    assert len(gold_submission.columns) == 7, "Unexpected number of columns in gold submission"
    assert len(test_without_labels.columns) == 3, "Unexpected number of columns in private test"

    assert len(test_without_labels) == len(
        test_df
    ), "Length mismatch between public test and private test"
    assert len(sample_submission) == len(
        test_df
    ), "Length mismatch between sample submission and private test"
    assert len(gold_submission) == len(
        test_df
    ), "Length mismatch between gold submission and private test"
132
+
133
+
134
def prepare(raw: Path, public: Path, private: Path):
    """Build the main competition split plus a parallel validation split.

    The main split goes to `public`/`private`; a second split of the main
    training data goes to sibling `public_val`/`private_val` directories.
    """
    full_train = read_csv(raw / "train.csv")

    # === Main Data Split (Train/Test) ===
    # Splitting on `spectrogram_id` (coarser than `eeg_id`, which is coarser
    # than `label_id`) keeps all related rows on one side of the boundary,
    # avoiding leakage.
    main_train_ids, main_test_ids = train_test_split(
        full_train["spectrogram_id"].unique(), test_size=0.1, random_state=0
    )

    main_train = full_train[full_train["spectrogram_id"].isin(main_train_ids)]
    main_test = full_train[full_train["spectrogram_id"].isin(main_test_ids)]

    # Write the main split to the `public` and `private` directories.
    _process_split(main_train, main_test, public, private, raw)

    # Shared assets are copied only into the main public directory here.
    shutil.copytree(raw / "example_figures", public / "example_figures")

    assert len(main_train) + len(main_test) == len(
        full_train
    ), "Expected train + test length to be equal to original train length"

    # === Validation Data Split ===
    # Re-split the main training set to carve out a validation set, written
    # to parallel `public_val`/`private_val` directories.
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"

    # The main training set holds 90% of the data; holding out 1/9 of it
    # gives a validation set matching the main test set's 10% share.
    val_train_ids, val_test_ids = train_test_split(
        main_train["spectrogram_id"].unique(), test_size=1 / 9, random_state=0
    )

    val_train = main_train[main_train["spectrogram_id"].isin(val_train_ids)]
    val_test = main_train[main_train["spectrogram_id"].isin(val_test_ids)]

    # Reuse the exact same processing logic as the main split.
    _process_split(val_train, val_test, public_val, private_val, raw)

    # Mirror the shared assets into the validation public directory.
    shutil.copytree(raw / "example_figures", public_val / "example_figures")

    assert len(val_train) + len(val_test) == len(
        main_train
    ), "Expected train_val + test_val length to be equal to new_train length"
@@ -0,0 +1,41 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+
4
+ from mlebench.grade_helpers import InvalidSubmissionError
5
+ from mlebench.metrics import mean_average_precision_at_k
6
+
7
+
8
def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame) -> dict:
    """Validate a submission against the answers and shape both for MAP@k.

    Returns a dict with `actual` (one set of true hotel ids per row) and
    `predicted` (one ranked list of predicted hotel ids per row), aligned
    row-by-row after sorting on the image id.

    Raises:
        InvalidSubmissionError: if the submission is missing a required
            column, or its ids/length disagree with the answers.
    """
    id_col = "image"
    target_col = "hotel_id"

    # Required columns: check the submission first, in a fixed order.
    for col in (id_col, target_col):
        if col not in submission.columns:
            raise InvalidSubmissionError(f"Submission is missing `{col}` column")
    assert id_col in answers.columns, f"Answers is missing `{id_col}` column"
    assert target_col in answers.columns, f"Answers is missing `{target_col}` column"

    if set(submission[id_col]) != set(answers[id_col]):
        raise InvalidSubmissionError("Submission and answers have different ids")

    if len(submission) != len(answers):
        raise InvalidSubmissionError("Submission and answers have different lengths")

    # Sort both frames by id so each prediction is graded against the
    # matching answer row.
    aligned_sub = submission.sort_values(id_col).reset_index(drop=True)
    aligned_ans = answers.sort_values(id_col).reset_index(drop=True)

    # Space-separated hotel-id strings become a ranked list (submission)
    # and a set (answers).
    predicted = [value.split(" ") for value in aligned_sub[target_col].astype(str)]
    actual = [set(value.split(" ")) for value in aligned_ans[target_col].astype(str)]

    return {"actual": actual, "predicted": predicted}
37
+
38
+
39
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Score a submission with mean average precision at k=5."""
    inputs = prepare_for_metric(submission, answers)
    return mean_average_precision_at_k(
        actual=inputs["actual"], predicted=inputs["predicted"], k=5
    )
@@ -0,0 +1,63 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ from sklearn.model_selection import train_test_split
5
+ from tqdm.auto import tqdm
6
+
7
+ from mlebench.utils import read_csv
8
+
9
+
10
def prepare(raw: Path, public: Path, private: Path):
    """Split the raw hotel-id training data into a public train set and a private test set.

    Args:
        raw: Directory holding the original competition download.
        public: Output directory for participant-visible files.
        private: Output directory for grading-only files.
    """
    old_train = read_csv(raw / "train.csv")
    # drop image ce27d36c9147cc19.jpg: it appears twice and may occur across train and test when split
    old_train = old_train[old_train["image"] != "ce27d36c9147cc19.jpg"]

    new_train, new_test = train_test_split(old_train, test_size=0.1, random_state=0)
    sample_submission = new_test.copy()[["image", "hotel_id"]]
    # Constant placeholder ranking used for every row of the sample submission.
    sample_submission["hotel_id"] = "36363 53586 18807 64314 60181"

    # save public files
    new_train.to_csv(public / "train.csv", index=False)
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    # copy images from raw to prepared/public accordingly;
    # train images keep their per-chain subdirectories
    (public / "train_images").mkdir(exist_ok=True, parents=True)
    for image, chain in tqdm(
        zip(new_train["image"], new_train["chain"]), total=len(new_train), desc="Train images"
    ):
        chain = str(chain)
        (public / "train_images" / chain).mkdir(exist_ok=True, parents=True)
        shutil.copy(raw / "train_images" / chain / image, public / "train_images" / chain / image)

    # Test images are copied into a single flat directory. The directory is
    # created once here (the original re-ran mkdir on every iteration).
    (public / "test_images").mkdir(exist_ok=True, parents=True)
    for image, chain in tqdm(
        zip(new_test["image"], new_test["chain"]), total=len(new_test), desc="Test images"
    ):
        chain = str(chain)
        shutil.copy(raw / "train_images" / chain / image, public / "test_images" / image)

    # save private files
    new_test.to_csv(private / "test.csv", index=False)

    # checks
    assert len(new_train) + len(new_test) == len(
        old_train
    ), "Length of new_train and new_test should equal length of old_train"
    assert sample_submission.columns.to_list() == [
        "image",
        "hotel_id",
    ], "Sample submission columns should only be `image` and `hotel_id`"
    assert len(sample_submission) == len(new_test), "Sample submission length should match test set"
    for image, chain in zip(new_train["image"], new_train["chain"]):
        chain = str(chain)
        assert (
            public / "train_images" / chain / image
        ).exists(), f"Image {image} not found in train_images folder"
    for image in new_test["image"]:
        assert (
            public / "test_images" / image
        ).exists(), f"Image {image} not found in test_images folder"
    assert not set(new_train["image"]).intersection(
        set(new_test["image"])
    ), "Train and test ids overlap"
@@ -0,0 +1,132 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ from sklearn.model_selection import train_test_split
5
+ from tqdm.auto import tqdm
6
+
7
+ from mlebench.utils import read_csv
8
+
9
+
10
def _create_split(
    df_to_split,
    raw_images_base_path: Path,
    public_path: Path,
    private_path: Path,
    test_size: float,
    random_state: int,
):
    """
    Split `df_to_split` into train/test, write the CSV artifacts, and copy images.

    Args:
        df_to_split: The DataFrame to be split into train and test sets.
        raw_images_base_path: Path to the directory containing the original images.
        public_path: The public output directory for this split.
        private_path: The private output directory for this split.
        test_size: The proportion of the dataset to allocate to the test split.
        random_state: The seed used by the random number generator.

    Returns:
        The (train, test) DataFrame pair produced by the split.
    """
    # Ensure both output roots exist.
    for out_dir in (public_path, private_path):
        out_dir.mkdir(exist_ok=True, parents=True)

    # Perform the split.
    train_part, test_part = train_test_split(
        df_to_split, test_size=test_size, random_state=random_state
    )
    sample_submission = test_part.copy()[["image", "hotel_id"]]
    sample_submission["hotel_id"] = "36363 53586 18807 64314 60181"

    # Public CSV artifacts.
    train_part.to_csv(public_path / "train.csv", index=False)
    sample_submission.to_csv(public_path / "sample_submission.csv", index=False)

    # Train images keep their per-chain subdirectories.
    (public_path / "train_images").mkdir(exist_ok=True, parents=True)
    for img, chain_id in tqdm(
        zip(train_part["image"], train_part["chain"]),
        total=len(train_part),
        desc=f"Train images for {public_path.name}",
    ):
        chain_dir = public_path / "train_images" / str(chain_id)
        chain_dir.mkdir(exist_ok=True, parents=True)
        shutil.copy(raw_images_base_path / str(chain_id) / img, chain_dir / img)

    # Test images go into one flat directory.
    (public_path / "test_images").mkdir(exist_ok=True, parents=True)
    for img, chain_id in tqdm(
        zip(test_part["image"], test_part["chain"]),
        total=len(test_part),
        desc=f"Test images for {public_path.name}",
    ):
        shutil.copy(
            raw_images_base_path / str(chain_id) / img, public_path / "test_images" / img
        )

    # Private CSV artifact: the labeled test set.
    test_part.to_csv(private_path / "test.csv", index=False)

    # Integrity checks.
    assert len(train_part) + len(test_part) == len(
        df_to_split
    ), "Length of new_train and new_test should equal length of input dataframe"
    assert sample_submission.columns.to_list() == [
        "image",
        "hotel_id",
    ], "Sample submission columns should only be `image` and `hotel_id`"
    assert len(sample_submission) == len(test_part), "Sample submission length should match test set"
    for img, chain_id in zip(train_part["image"], train_part["chain"]):
        assert (
            public_path / "train_images" / str(chain_id) / img
        ).exists(), f"Image {img} not found in train_images folder"
    for img in test_part["image"]:
        assert (
            public_path / "test_images" / img
        ).exists(), f"Image {img} not found in test_images folder"
    assert not set(train_part["image"]) & set(
        test_part["image"]
    ), "Train and test ids overlap"

    return train_part, test_part
+ return new_train, new_test
94
+
95
+
96
def prepare(raw: Path, public: Path, private: Path):
    """Produce the main benchmark split and a parallel validation split.

    The main split is written to `public`/`private`; a second split of the
    main training data is written to sibling `public_val`/`private_val`
    directories.
    """
    full_df = read_csv(raw / "train.csv")
    # drop image ce27d36c9147cc19.jpg: it appears twice and may occur across train and test when split
    full_df = full_df[full_df["image"] != "ce27d36c9147cc19.jpg"]

    # First split: the primary benchmark data. Output in `public`/`private`
    # matches the original preparation script exactly.
    main_train, _ = _create_split(
        df_to_split=full_df,
        raw_images_base_path=raw / "train_images",
        public_path=public,
        private_path=private,
        test_size=0.1,
        random_state=0,
    )

    # Second split: carve a validation set out of the main training data and
    # write it to parallel `public_val`/`private_val` directories.
    val_public = public.parent / "public_val"
    val_private = private.parent / "private_val"

    # The main training data holds 90% of the total, so holding out 1/9 of
    # it yields a validation set matching the original 10% test share.
    _create_split(
        df_to_split=main_train,
        raw_images_base_path=raw / "train_images",
        public_path=val_public,
        private_path=val_private,
        test_size=1 / 9,
        random_state=0,  # same seed for consistency and determinism
    )