dslighting 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352) hide show
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/METADATA +1 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,155 @@
1
+ import json
2
+ import random
3
+ import shutil
4
+ from pathlib import Path
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ from sklearn.model_selection import train_test_split
9
+ from tqdm import tqdm
10
+
11
+ from mlebench.competitions.utils import get_logger, rle_encode
12
+
13
+ logger = get_logger(__name__)
14
+
15
+
16
def prepare(raw: Path, public: Path, private: Path):
    """
    Prepare public/private competition splits from the raw contrails data.

    We make a train/test split from the old train set, using the same train/test
    proportion as the original competition. Concretely, the new split has 18673
    train samples and 1856 test samples. We also copy over the validation set as-is.

    `sample_submission` is created with random predictions, either "1 3 10 5" or "-" (empty).

    Args:
        raw: Directory holding the original competition data (treated as read-only).
        public: Output directory for participant-visible files.
        private: Output directory for grading files (`answers.csv`).
    """
    DEV = False  # set True for a tiny 100-sample run while debugging

    with open(raw / "train_metadata.json", "r") as f:
        train_metadata = json.load(f)
    train_metadata = train_metadata[:100] if DEV else train_metadata
    with open(raw / "validation_metadata.json", "r") as f:
        validation_metadata = json.load(f)

    if DEV:
        new_train, new_test = train_metadata[:90], train_metadata[90:]
    else:
        # Test size matches the original competition's validation-set size.
        new_train, new_test = train_test_split(
            train_metadata, test_size=len(validation_metadata), random_state=0
        )
    logger.info(
        f"Created new split with {len(new_train)} train samples and {len(new_test)} test samples"
    )

    # Create answers csv containing ground truth masks, heights, widths
    test_answers = []
    for sample in tqdm(new_test):
        record_id = sample["record_id"]
        mask = np.load(raw / "train" / record_id / "human_pixel_masks.npy")
        rle = rle_encode(mask)
        rle = " ".join([str(i) for i in rle]) if rle else "-"

        # Any band works for the image dimensions; band_08 is the first one.
        band = np.load(raw / "train" / record_id / "band_08.npy")
        height, width, _ = band.shape
        test_answers.append(
            {
                "record_id": record_id,
                "encoded_pixels": rle,
                "height": height,
                "width": width,
            }
        )
    test_answers = pd.DataFrame(test_answers)

    for sample in tqdm(new_train):
        record_id = sample["record_id"]
        (public / "train" / record_id).mkdir(exist_ok=True, parents=True)
        # Copy bands
        for band_idx in range(8, 17):
            file_name = f"band_{band_idx:02}.npy"
            shutil.copyfile(
                src=raw / "train" / record_id / file_name,
                dst=public / "train" / record_id / file_name,
            )
        # Copy human individual masks
        shutil.copyfile(
            src=raw / "train" / record_id / "human_individual_masks.npy",
            dst=public / "train" / record_id / "human_individual_masks.npy",
        )
        # Copy human pixel masks
        shutil.copyfile(
            src=raw / "train" / record_id / "human_pixel_masks.npy",
            dst=public / "train" / record_id / "human_pixel_masks.npy",
        )

    for sample in tqdm(new_test):
        record_id = sample["record_id"]
        (public / "test" / record_id).mkdir(exist_ok=True, parents=True)
        # Copy bands only — ground-truth masks are withheld from the test split.
        for band_idx in range(8, 17):
            file_name = f"band_{band_idx:02}.npy"
            shutil.copyfile(
                src=raw / "train" / record_id / file_name,
                dst=public / "test" / record_id / file_name,
            )

    # Copy over existing validation data.
    # FIX: the previous code did `(raw / "validation").mkdir(exist_ok=True, parents=True)`,
    # which mutated the read-only raw input and silently masked a missing validation
    # set by copying an empty directory. `copytree` creates the destination itself,
    # so we let it fail loudly if `raw/validation` is absent.
    shutil.copytree(raw / "validation", public / "validation", dirs_exist_ok=True)
    shutil.copyfile(raw / "validation_metadata.json", public / "validation_metadata.json")

    # Write other files
    with open(public / "train_metadata.json", "w") as f:
        f.write(json.dumps(new_train))
    test_answers.to_csv(private / "answers.csv", index=False)

    # Random baseline submission: each row gets either a fixed RLE or "-" (empty).
    submission_df = test_answers.copy()
    random.seed(0)
    submission_df["encoded_pixels"] = [
        random.choice(["1 3 10 5", "-"]) for _ in range(len(submission_df))
    ]
    submission_df.to_csv(public / "sample_submission.csv", index=False)

    # Sanity checks
    assert (public / "train_metadata.json").exists(), "`train_metadata.json` doesn't exist!"
    assert (public / "sample_submission.csv").exists(), "`sample_submission.csv` doesn't exist!"
    assert (
        public / "validation_metadata.json"
    ).exists(), "`validation_metadata.json` doesn't exist!"
    assert (public / "train").exists(), "`train` directory doesn't exist!"
    assert (public / "test").exists(), "`test` directory doesn't exist!"
    # FIX: message previously said "`public` directory doesn't exist!" for this check.
    assert (public / "validation").exists(), "`validation` directory doesn't exist!"
    assert (private / "answers.csv").exists(), "`answers.csv` doesn't exist!"

    # Each record ships 9 bands (band_08 .. band_16).
    new_train_bands = [img.stem for img in (public / "train").rglob("band*.npy")]
    assert (
        len(new_train_bands) == len(new_train) * 9
    ), f"Expected {len(new_train) * 9} bands in the train set, but got {len(new_train_bands)}!"
    new_test_bands = [img.stem for img in (public / "test").rglob("band*.npy")]
    # FIX: message previously omitted the word "bands".
    assert (
        len(new_test_bands) == len(new_test) * 9
    ), f"Expected {len(new_test) * 9} bands in the test set, but got {len(new_test_bands)}!"

    new_train_individual_masks = [
        img.stem for img in (public / "train").rglob("human_individual_masks.npy")
    ]
    assert len(new_train_individual_masks) == len(
        new_train
    ), f"Expected 1 human individual mask per sample in the train set, but got {len(new_train_individual_masks)}!"
    new_test_individual_masks = [
        img.stem for img in (public / "test").rglob("human_individual_masks.npy")
    ]
    assert (
        len(new_test_individual_masks) == 0
    ), f"Expected 0 human individual masks per sample in the test set, but got {len(new_test_individual_masks)}!"

    new_train_pixel_masks = [
        img.stem for img in (public / "train").rglob("human_pixel_masks.npy")
    ]
    assert len(new_train_pixel_masks) == len(
        new_train
    ), f"Expected 1 human pixel mask per sample in the train set, but got {len(new_train_pixel_masks)}!"
    new_test_pixel_masks = [
        img.stem for img in (public / "test").rglob("human_pixel_masks.npy")
    ]
    assert (
        len(new_test_pixel_masks) == 0
    ), f"Expected 0 human pixel masks per sample in the test set, but got {len(new_test_pixel_masks)}!"
@@ -0,0 +1,211 @@
1
+ import json
2
+ import random
3
+ import shutil
4
+ from pathlib import Path
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ from sklearn.model_selection import train_test_split
9
+ from tqdm import tqdm
10
+
11
+ from mlebench.competitions.utils import get_logger, rle_encode
12
+
13
+ logger = get_logger(__name__)
14
+
15
+
16
def _create_answers_df(samples: list, raw_path: Path) -> pd.DataFrame:
    """Build the ground-truth answers dataframe for the given samples.

    For each sample, the human pixel mask is run-length encoded ("-" when the
    mask has no runs) and the image height/width are read from band 08.

    Args:
        samples: Sample metadata dicts, each containing a "record_id" key.
        raw_path: Root of the raw data; records live under `raw_path/train`.

    Returns:
        DataFrame with columns record_id, encoded_pixels, height, width.
    """
    rows = []
    for sample in tqdm(samples, desc="Creating answers CSV"):
        record_id = sample["record_id"]
        record_dir = raw_path / "train" / record_id

        mask = np.load(record_dir / "human_pixel_masks.npy")
        runs = rle_encode(mask)
        encoded = " ".join(str(v) for v in runs) if runs else "-"

        # Band 08 carries the image dimensions used for decoding the RLE.
        height, width, _ = np.load(record_dir / "band_08.npy").shape
        rows.append(
            {
                "record_id": record_id,
                "encoded_pixels": encoded,
                "height": height,
                "width": width,
            }
        )
    return pd.DataFrame(rows)
36
+
37
+
38
def _copy_data_files(samples: list, raw_path: Path, dest_path: Path, include_masks: bool):
    """Copy band files (and, for train data, mask files) for each sample.

    Args:
        samples: Sample metadata dicts, each containing a "record_id" key.
        raw_path: Root of the raw data; files are read from `raw_path/train`.
        dest_path: Destination root; one sub-directory per record is created.
        include_masks: When True, also copy the two human-annotation mask files.
    """
    desc = f"Copying {'train' if include_masks else 'test'} files"
    for sample in tqdm(samples, desc=desc):
        record_id = sample["record_id"]
        src_dir = raw_path / "train" / record_id
        dst_dir = dest_path / record_id
        dst_dir.mkdir(exist_ok=True, parents=True)

        # Bands 08-16 are the per-record channel files shipped with the data.
        file_names = [f"band_{band_idx:02}.npy" for band_idx in range(8, 17)]
        if include_masks:
            # Masks are only exposed for the training split.
            file_names.append("human_individual_masks.npy")
            file_names.append("human_pixel_masks.npy")
        for file_name in file_names:
            shutil.copyfile(src=src_dir / file_name, dst=dst_dir / file_name)
62
+
63
+
64
def prepare(raw: Path, public: Path, private: Path):
    """
    We make train/test split from old train set, using same train/test proportion as the original
    competition. Concretely, the new split has 18673 train samples and 1856 test samples. We also
    copy over the validation set as-is.

    `sample_submission` is created with random predictions, either "1 3 10 5" or "-" (empty)

    A second split is performed on the train set to create a new validation set in parallel
    `public_val` and `private_val` directories.

    Args:
        raw: Directory containing the raw competition data.
        public: Output directory for participant-visible files.
        private: Output directory for grading files (ground-truth answers).
    """
    DEV = False  # when True, operate on a 100-sample subset for quick local runs

    with open(raw / "train_metadata.json", "r") as f:
        train_metadata = json.load(f)
    train_metadata = train_metadata[:100] if DEV else train_metadata
    with open(raw / "validation_metadata.json", "r") as f:
        validation_metadata = json.load(f)

    # ==================================================================================
    # 1. Original Split: Create `new_train` and `new_test`
    # ==================================================================================
    if DEV:
        new_train, new_test = train_metadata[:90], train_metadata[90:]
    else:
        # The new test set is sized to match the official validation set.
        new_train, new_test = train_test_split(
            train_metadata, test_size=len(validation_metadata), random_state=0
        )
    logger.info(
        f"Created original split with {len(new_train)} train samples and {len(new_test)} test samples"
    )

    # ==================================================================================
    # 2. Process and Save Original Split to `public` and `private`
    # ==================================================================================
    public.mkdir(exist_ok=True)
    private.mkdir(exist_ok=True)

    # Copy train and test files (masks are withheld from the test set)
    _copy_data_files(new_train, raw, public / "train", include_masks=True)
    _copy_data_files(new_test, raw, public / "test", include_masks=False)

    # Create and save ground truth answers for the test set
    test_answers = _create_answers_df(new_test, raw)
    test_answers.to_csv(private / "answers.csv", index=False)

    # Save train metadata
    with open(public / "train_metadata.json", "w") as f:
        f.write(json.dumps(new_train))

    # Create and save a sample submission with random placeholder predictions
    submission_df = test_answers.copy()
    random.seed(0)
    submission_df["encoded_pixels"] = [
        random.choice(["1 3 10 5", "-"]) for _ in range(len(submission_df))
    ]
    submission_df.to_csv(public / "sample_submission.csv", index=False)

    # Copy over existing validation data (this is unique to the original set)
    # FIX: the original code ran `(raw / "validation").mkdir(...)`, creating the
    # *source* directory when missing, which made the copytree below silently
    # copy an empty directory. Fail loudly instead if raw validation data is absent.
    assert (raw / "validation").exists(), "`raw/validation` directory doesn't exist!"
    shutil.copytree(raw / "validation", public / "validation", dirs_exist_ok=True)
    shutil.copyfile(raw / "validation_metadata.json", public / "validation_metadata.json")

    # ==================================================================================
    # 3. New Validation Split: Split `new_train` into `train_val` and `test_val`
    # ==================================================================================
    train_val, test_val = train_test_split(
        new_train, test_size=len(new_test), random_state=0
    )
    logger.info(
        f"Created validation split with {len(train_val)} train_val samples and {len(test_val)} test_val samples"
    )

    # ==================================================================================
    # 4. Process and Save Validation Split to `public_val` and `private_val`
    # ==================================================================================
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"
    public_val.mkdir(exist_ok=True)
    private_val.mkdir(exist_ok=True)

    # Copy train_val and test_val files
    _copy_data_files(train_val, raw, public_val / "train", include_masks=True)
    _copy_data_files(test_val, raw, public_val / "test", include_masks=False)

    # Create and save ground truth answers for the test_val set
    test_val_answers = _create_answers_df(test_val, raw)
    # The filename must be "answers.csv" to mirror the private directory structure
    test_val_answers.to_csv(private_val / "answers.csv", index=False)

    # Save train_val metadata
    # The filename must be "train_metadata.json" to mirror the public directory structure
    with open(public_val / "train_metadata.json", "w") as f:
        f.write(json.dumps(train_val))

    # Create and save a sample submission for the validation set
    submission_val_df = test_val_answers.copy()
    random.seed(0)
    submission_val_df["encoded_pixels"] = [
        random.choice(["1 3 10 5", "-"]) for _ in range(len(submission_val_df))
    ]
    # The filename must be "sample_submission.csv" to mirror the public directory structure
    submission_val_df.to_csv(public_val / "sample_submission.csv", index=False)

    # ==================================================================================
    # 5. Sanity Checks
    # ==================================================================================
    logger.info("Performing sanity checks for original directories...")
    # Sanity checks for original directories
    assert (public / "train_metadata.json").exists(), "`train_metadata.json` doesn't exist!"
    assert (public / "sample_submission.csv").exists(), "`sample_submission.csv` doesn't exist!"
    assert (
        public / "validation_metadata.json"
    ).exists(), "`validation_metadata.json` doesn't exist!"
    assert (public / "train").exists(), "`train` directory doesn't exist!"
    assert (public / "test").exists(), "`test` directory doesn't exist!"
    # FIX: message previously said "`public` directory doesn't exist!" for this check.
    assert (public / "validation").exists(), "`validation` directory doesn't exist!"
    assert (private / "answers.csv").exists(), "`answers.csv` doesn't exist!"

    # Each record must contribute exactly 9 band files (bands 08-16).
    new_train_bands = list(img.stem for img in (public / "train").rglob("band*.npy"))
    assert (
        len(new_train_bands) == len(new_train) * 9
    ), f"Expected {len(new_train) * 9} bands in the train set, but got {len(new_train_bands)}!"
    new_test_bands = list(img.stem for img in (public / "test").rglob("band*.npy"))
    assert (
        len(new_test_bands) == len(new_test) * 9
    ), f"Expected {len(new_test) * 9} in the test set, but got {len(new_test_bands)}!"

    # Each training record must have exactly one of each mask file.
    new_train_individual_masks = list(
        img.stem for img in (public / "train").rglob("human_individual_masks.npy")
    )
    assert len(new_train_individual_masks) == len(
        new_train
    ), f"Expected 1 human individual mask per sample in the train set, but got {len(new_train_individual_masks)}!"
    new_train_pixel_masks = list(
        img.stem for img in (public / "train").rglob("human_pixel_masks.npy")
    )
    assert len(new_train_pixel_masks) == len(
        new_train
    ), f"Expected 1 human pixel mask per sample in the train set, but got {len(new_train_pixel_masks)}!"

    logger.info("Performing sanity checks for validation directories...")
    # Sanity checks for new validation directories
    assert (public_val / "train_metadata.json").exists(), "`public_val/train_metadata.json` doesn't exist!"
    assert (public_val / "sample_submission.csv").exists(), "`public_val/sample_submission.csv` doesn't exist!"
    assert (public_val / "train").exists(), "`public_val/train` directory doesn't exist!"
    assert (public_val / "test").exists(), "`public_val/test` directory doesn't exist!"
    assert (private_val / "answers.csv").exists(), "`private_val/answers.csv` doesn't exist!"
@@ -0,0 +1,42 @@
1
+ import pandas as pd
2
+
3
+ from mlebench.grade_helpers import InvalidSubmissionError
4
+ from mlebench.metrics import mean_average_precision_at_k
5
+
6
+
7
def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame) -> dict:
    """
    Validate a submission against the answers and convert both into the
    row-aligned format expected by the MAP@k metric.

    Args:
        submission: DataFrame with 'customer_id' and 'prediction' columns;
            must cover at least every customer_id present in `answers`.
        answers: Ground-truth DataFrame with 'customer_id' and 'prediction'
            columns, one row per customer.

    Returns:
        dict with 'actual' (list of sets of article ids) and 'predicted'
        (list of ranked lists of article ids), aligned by customer_id.

    Raises:
        InvalidSubmissionError: if the submission is malformed.
    """
    # Answers checks
    assert set(answers.columns) == {
        "customer_id",
        "prediction",
    }, "Answers DataFrame must have 'customer_id' and 'prediction' columns."

    # Submission checks
    if set(submission.columns) != {"customer_id", "prediction"}:
        raise InvalidSubmissionError(
            "Submission DataFrame must have 'customer_id' and 'prediction' columns."
        )
    if not (set(submission["customer_id"]) >= set(answers["customer_id"])):
        raise InvalidSubmissionError(
            "Submission customer_id must be a superset of answers customer_id"
        )
    # FIX: duplicated customer_ids pass the superset check above but break the
    # row alignment done below (filtered submission would be longer than
    # answers), so reject them explicitly.
    if submission["customer_id"].duplicated().any():
        raise InvalidSubmissionError(
            "Submission must not contain duplicate customer_ids"
        )

    # Filter the submission to only consider the customer_ids that exist in answers
    submission = submission[submission["customer_id"].isin(answers["customer_id"])]

    # Sort to ensure we're grading the right rows
    submission = submission.sort_values("customer_id").reset_index(drop=True)
    answers = answers.sort_values("customer_id").reset_index(drop=True)

    # Ground truth is order-insensitive (sets); predictions are ranked lists.
    y_true = answers["prediction"].astype(str).str.split(" ").apply(set).tolist()
    y_pred = submission["prediction"].astype(str).str.split(" ").tolist()

    return {"actual": y_true, "predicted": y_pred}
35
+
36
+
37
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Score a submission with mean average precision at k=12.

    Validation and reshaping of the inputs is delegated to
    `prepare_for_metric`, which raises on malformed submissions.
    """
    data = prepare_for_metric(submission, answers)
    return mean_average_precision_at_k(
        actual=data["actual"], predicted=data["predicted"], k=12
    )
@@ -0,0 +1,102 @@
1
+ import os
2
+ import shutil
3
+ from pathlib import Path
4
+
5
+ import pandas as pd
6
+
7
+ from mlebench.utils import read_csv
8
+
9
+
10
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.

    The final 7-day window of transactions becomes the hidden test period; all
    earlier transactions become the public training data.

    Args:
        raw: Directory with the raw competition data.
        public: Output directory for participant-visible files.
        private: Output directory for grading files (ground-truth answers).
    """

    # Create train, test from train split
    old_train = read_csv(raw / "transactions_train.csv")
    # Synthetic key used only to verify no transaction lands in both splits.
    old_train["purchase_id"] = (
        old_train["customer_id"].astype(str)
        + "_"
        + old_train["article_id"].astype(str)
        + "_"
        + old_train["t_dat"].astype(str)
    )

    # The task is to predict what purchases will be made in the next 7 days.
    # To create our test set, we will take the purchases made in the last 7 days of the training set.
    old_train["t_dat_parsed"] = pd.to_datetime(
        old_train["t_dat"]
    )  # Parse t_dat to datetime in a new column
    max_date = old_train["t_dat_parsed"].max()  # Find the maximum date in the t_dat_parsed column
    # NOTE(review): `>=` makes the window inclusive at both ends, so it can
    # span up to 8 calendar dates — confirm this matches the intended split.
    old_train["in_last_7_days"] = old_train["t_dat_parsed"] >= (max_date - pd.Timedelta(days=7))
    new_train = old_train[
        old_train["in_last_7_days"] == False
    ].copy()  # Filter rows where t_dat_parsed is more than 7 days from the maximum date
    new_test = old_train[
        old_train["in_last_7_days"] == True
    ].copy()  # Filter rows where t_dat_parsed is within the last 7 days of the time series

    # Train/test checks
    assert (
        not new_test["purchase_id"].isin(new_train["purchase_id"]).any()
    ), "No purchase_ids should be shared between new_test and new_train"
    new_train = new_train.drop(columns=["purchase_id", "t_dat_parsed", "in_last_7_days"])
    new_test = new_test.drop(columns=["purchase_id", "t_dat_parsed"])

    # sample submission and answers differ because the task is predicting what articles each
    # customer will purchase in the 7-day period immediately after the training data ends. Customer
    # who did not make any purchase during that time are excluded from the scoring.

    # As such we can't put the exact customer ids from test set into the sample submission, as this
    # would leak which customers made purchases in the test set. Instead, we put _all_ the customer
    # ids in the sample submission, ask the user to predict for all of them, and then we will filter
    # out in grade.py the customers who did not make any purchases in the test set.

    # Answers, contains only customers that actually made purchases in the test period.
    answers = (
        new_test.groupby("customer_id")["article_id"]
        .apply(lambda x: " ".join(x.astype(str)))
        .reset_index()
    )
    # rename 'article_id' to 'prediction'
    answers = answers.rename(columns={"article_id": "prediction"})

    # Sample submission, which contains all customer ids.
    shutil.copyfile(
        src=raw / "sample_submission.csv",
        dst=public / "sample_submission.csv",
    )

    # Write CSVs (the raw test rows themselves are intentionally not written out)
    answers.to_csv(private / "answers.csv", index=False)
    new_train.to_csv(public / "transactions_train.csv", index=False)

    # Copy files and images directory
    shutil.copyfile(
        src=raw / "articles.csv",
        dst=public / "articles.csv",
    )
    shutil.copyfile(
        src=raw / "customers.csv",
        dst=public / "customers.csv",
    )
    shutil.copytree(
        src=raw / "images",
        dst=public / "images",
        dirs_exist_ok=True,
    )

    # checks
    expected_train_columns = ["t_dat", "customer_id", "article_id", "price", "sales_channel_id"]
    # FIX: corrected "Unexcpected" typo in the assertion messages below.
    assert (
        new_train.columns.tolist() == expected_train_columns
    ), f"Unexpected columns in new_train, expected {expected_train_columns}, got {new_train.columns.tolist()}"

    expected_answer_columns = ["customer_id", "prediction"]
    assert (
        answers.columns.tolist() == expected_answer_columns
    ), f"Unexpected columns in answers, expected {expected_answer_columns}, got {answers.columns.tolist()}"
    assert answers["customer_id"].nunique() == len(
        answers
    ), "There should be no duplicate customer_ids in answers"
@@ -0,0 +1,132 @@
1
+ import os
2
+ import shutil
3
+ from pathlib import Path
4
+
5
+ import pandas as pd
6
+
7
+ from mlebench.utils import read_csv
8
+
9
+
10
+ def _split_and_process_data(transactions_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
11
+ """
12
+ Splits a dataframe into train and test sets based on the last 7 days of transactions.
13
+
14
+ Args:
15
+ transactions_df: The input dataframe with transaction data.
16
+
17
+ Returns:
18
+ A tuple containing:
19
+ - new_train_df: The training data (all data except the last 7 days).
20
+ - answers_df: The ground truth answers for the test set (last 7 days).
21
+ """
22
+ df = transactions_df.copy()
23
+ if "purchase_id" not in df.columns:
24
+ df["purchase_id"] = (
25
+ df["customer_id"].astype(str)
26
+ + "_"
27
+ + df["article_id"].astype(str)
28
+ + "_"
29
+ + df["t_dat"].astype(str)
30
+ )
31
+
32
+ # The task is to predict what purchases will be made in the next 7 days.
33
+ # To create our test set, we will take the purchases made in the last 7 days of the training set.
34
+ df["t_dat_parsed"] = pd.to_datetime(df["t_dat"]) # Parse t_dat to datetime in a new column
35
+ max_date = df["t_dat_parsed"].max() # Find the maximum date in the t_dat_parsed column
36
+ df["in_last_7_days"] = df["t_dat_parsed"] >= (max_date - pd.Timedelta(days=7))
37
+
38
+ new_train_df = df[df["in_last_7_days"] == False].copy()
39
+ new_test_df = df[df["in_last_7_days"] == True].copy()
40
+
41
+ # Train/test checks
42
+ assert (
43
+ not new_test_df["purchase_id"].isin(new_train_df["purchase_id"]).any()
44
+ ), "No purchase_ids should be shared between a test and train split"
45
+
46
+ new_train_df = new_train_df.drop(columns=["purchase_id", "t_dat_parsed", "in_last_7_days"])
47
+
48
+ # Answers, contains only customers that actually made purchases in the test period.
49
+ answers_df = (
50
+ new_test_df.groupby("customer_id")["article_id"]
51
+ .apply(lambda x: " ".join(x.astype(str)))
52
+ .reset_index()
53
+ )
54
+ # rename 'article_id' to 'prediction'
55
+ answers_df = answers_df.rename(columns={"article_id": "prediction"})
56
+
57
+ return new_train_df, answers_df
58
+
59
+
60
+ def _copy_static_files(raw_path: Path, public_path: Path):
61
+ """Copies static competition files (articles, customers, images) to a public directory."""
62
+ # Sample submission, which contains all customer ids.
63
+ shutil.copyfile(
64
+ src=raw_path / "sample_submission.csv",
65
+ dst=public_path / "sample_submission.csv",
66
+ )
67
+ # Copy files and images directory
68
+ shutil.copyfile(
69
+ src=raw_path / "articles.csv",
70
+ dst=public_path / "articles.csv",
71
+ )
72
+ shutil.copyfile(
73
+ src=raw_path / "customers.csv",
74
+ dst=public_path / "customers.csv",
75
+ )
76
+ shutil.copytree(
77
+ src=raw_path / "images",
78
+ dst=public_path / "images",
79
+ dirs_exist_ok=True,
80
+ )
81
+
82
+
83
+ def _run_output_checks(train_df: pd.DataFrame, answers_df: pd.DataFrame):
84
+ """Runs assertions to check the format of final output dataframes."""
85
+ expected_train_columns = ["t_dat", "customer_id", "article_id", "price", "sales_channel_id"]
86
+ assert (
87
+ train_df.columns.tolist() == expected_train_columns
88
+ ), f"Unexcpected columns in new_train, expected {expected_train_columns}, got {train_df.columns.tolist()}"
89
+
90
+ expected_answer_columns = ["customer_id", "prediction"]
91
+ assert (
92
+ answers_df.columns.tolist() == expected_answer_columns
93
+ ), f"Unexcpected columns in answers, expected {expected_answer_columns}, got {answers_df.columns.tolist()}"
94
+ assert answers_df["customer_id"].nunique() == len(
95
+ answers_df
96
+ ), "There should be no duplicate customer_ids in answers"
97
+
98
+
99
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.
    Also creates a second, parallel validation split (in public_val/private_val).
    """
    transactions = read_csv(raw / "transactions_train.csv")

    # --- Original Data Split (Train/Test) ---
    # First split: produces the main competition train set and hidden answers.
    main_train, main_answers = _split_and_process_data(transactions)

    # Persist the original public and private files.
    main_answers.to_csv(private / "answers.csv", index=False)
    main_train.to_csv(public / "transactions_train.csv", index=False)
    _copy_static_files(raw, public)
    _run_output_checks(main_train, main_answers)

    # --- New Validation Data Split (Train/Val) ---
    # Create the parallel directories that hold the validation split.
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"
    public_val.mkdir(parents=True, exist_ok=True)
    private_val.mkdir(parents=True, exist_ok=True)

    # Second split: re-split the *output* training set to carve out a
    # smaller training set plus a validation period.
    val_train, val_answers = _split_and_process_data(main_train)

    # Persist the validation files, mirroring the original layout.
    val_answers.to_csv(private_val / "answers.csv", index=False)
    val_train.to_csv(public_val / "transactions_train.csv", index=False)
    _copy_static_files(raw, public_val)
    _run_output_checks(val_train, val_answers)
@@ -0,0 +1,23 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.metrics import accuracy_score
4
+
5
+
6
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """
    Grade the submission using accuracy metric.

    Args:
        submission: DataFrame with columns ['id', 'label']
        answers: DataFrame with columns ['id', 'label']

    Returns:
        Accuracy score (float between 0 and 1)

    Raises:
        ValueError: if the submission does not contain exactly one row for
            each id present in answers.
    """
    # Merge on id to ensure alignment
    merged = pd.merge(answers, submission, on='id', suffixes=('_true', '_pred'))

    # FIX: the inner merge silently drops answer rows with no matching
    # submission id (and multiplies rows on duplicate ids), which would skew
    # the score without any error; fail loudly instead.
    if len(merged) != len(answers):
        raise ValueError(
            "Submission must contain exactly one row for each id in answers"
        )

    # Accuracy = fraction of exact label matches (same result as
    # sklearn.metrics.accuracy_score, computed directly with pandas).
    return float((merged['label_true'] == merged['label_pred']).mean())