dslighting 1.7.1__py3-none-any.whl → 1.7.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352) hide show
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/METADATA +3 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,27 @@
1
from pathlib import Path

import pandas as pd


def prepare(raw: Path, public: Path, private: Path) -> None:
    """
    Prepare the demand-forecasting-kernels-only dataset.

    Reads the competition CSVs from ``raw``, copies the agent-visible files
    into ``public``, and writes the held-out answers into ``private`` for
    grading.

    Args:
        raw: Directory containing the original competition files.
        public: Output directory visible to agents.
        private: Output directory used only for grading.

    Raises:
        ValueError: If the raw answers fail the sanity checks below.
    """
    # Read data
    train = pd.read_csv(raw / "train.csv")
    test = pd.read_csv(raw / "test.csv")
    sample_submission = pd.read_csv(raw / "sample_submission.csv")
    test_answer = pd.read_csv(raw / "test_answer.csv")

    # Validate BEFORE writing anything, so a malformed raw dataset cannot
    # leave partially written public/private directories behind. Raise
    # explicitly rather than `assert`, which is stripped under `python -O`.
    if len(test_answer) != len(sample_submission):
        raise ValueError(
            f"Test answer ({len(test_answer)}) and sample submission "
            f"({len(sample_submission)}) must have same length"
        )
    if "id" not in test_answer.columns:
        raise ValueError("Test answer must have 'id' column")
    if "sales" not in test_answer.columns:
        raise ValueError("Test answer must have 'sales' column")

    # Public files (visible to agents)
    train.to_csv(public / "train.csv", index=False)
    test.to_csv(public / "test.csv", index=False)
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    # Private files (for grading)
    test_answer.to_csv(private / "test.csv", index=False)
@@ -0,0 +1,44 @@
1
from pandas import DataFrame
from sklearn.metrics import root_mean_squared_error

from mlebench.grade_helpers import InvalidSubmissionError


def grade(submission: DataFrame, answers: DataFrame) -> float:
    """
    Score a submission against the answers using RMSE on the 'value' column.

    Rows are aligned by 'id' before scoring.

    Args:
        submission: Competitor predictions; must contain 'id' and 'value'.
        answers: Ground-truth values; must contain 'id' and 'value'.

    Returns:
        The root mean squared error between predicted and true values.

    Raises:
        AssertionError: If the answers themselves are malformed (grader bug).
        InvalidSubmissionError: If the submission is malformed.
    """
    # Answer checks raise explicitly instead of using bare `assert` /
    # `assert False`, which are stripped under `python -O` and would let
    # malformed answers fall through silently. The exception type
    # (AssertionError) is preserved for callers.
    if "id" not in answers.columns:
        raise AssertionError("Answers must have an 'id' column.")
    if "value" not in answers.columns:
        raise AssertionError("Answers must have a 'value' column.")

    try:
        answers["value"].astype(float)
    except ValueError as e:
        raise AssertionError(
            "Answers must have float values in the 'value' column."
        ) from e

    if "id" not in submission.columns:
        raise InvalidSubmissionError("Submission must have an 'id' column.")

    if "value" not in submission.columns:
        raise InvalidSubmissionError("Submission must have a 'value' column.")

    try:
        submission["value"].astype(float)
    except ValueError as e:
        # Chain the cause so the original parse error is preserved.
        raise InvalidSubmissionError(
            "Submission must have float values in the 'value' column."
        ) from e

    if len(submission) != len(answers):
        raise InvalidSubmissionError(
            f"Expected the submission to have {len(answers)} rows, but got {len(submission)}."
        )

    # Sort both frames by 'id' (and columns alphabetically) so rows line up.
    submission_sorted = submission.sort_values(by="id").sort_index(axis=1)
    answers_sorted = answers.sort_values(by="id").sort_index(axis=1)

    if (submission_sorted["id"].values != answers_sorted["id"].values).any():
        raise InvalidSubmissionError(
            "Expected the submission to have the same 'id' values as the answers, but they differ."
        )

    # Ground truth comes from the answers, predictions from the submission.
    # (These were swapped in the original; RMSE is symmetric so the score
    # is numerically unchanged, but the labels now match their meaning.)
    y_true = answers_sorted["value"]
    y_pred = submission_sorted["value"]
    score = root_mean_squared_error(y_true=y_true, y_pred=y_pred)

    return score
@@ -0,0 +1,134 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ import numpy as np
5
+ import pandas as pd
6
+ from pandas import DataFrame, read_csv
7
+ from PIL import Image
8
+ from sklearn.model_selection import train_test_split
9
+
10
+ from mlebench.utils import extract
11
+
12
+
13
def prepare(raw: Path, public: Path, private: Path) -> None:
    """Build the public train/test split and the private answer key.

    Extracts the raw archives, carves a 20% test split out of the original
    training images, writes the per-pixel ground truth of the held-out
    (cleaned) images to `private/answers.csv`, and emits a constant-valued
    `public/sampleSubmission.csv` in the same format.

    Args:
        raw: Directory holding the raw Kaggle archives.
        public: Destination for participant-visible files.
        private: Destination for grading-only files.
    """
    # Create new train and test splits
    extract(raw / "train.zip", public)

    # Original test ratio has Train set - 144 samples; Test set - 72 samples (33% ratio)
    # We set new test ratio to 0.2 to keep the number of train samples at the
    # same order of magnitude while having enough samples for the new test set.
    old_train_img_fpaths = sorted((public / "train").glob("*.png"))
    new_train_img_fpaths, new_test_img_fpaths = train_test_split(
        old_train_img_fpaths,
        test_size=0.2,
        random_state=0,
    )

    new_test_imgs = {fpath.name for fpath in new_test_img_fpaths}

    # Create `test` directory and move the held-out images into it
    (public / "test").mkdir(exist_ok=True, parents=True)

    for fpath in (public / "train").glob("*.png"):
        if fpath.name in new_test_imgs:
            shutil.move(fpath, public / "test" / fpath.name)

    # Create public `train_cleaned` directories; the cleaned versions of the
    # held-out images become the (private) ground truth.
    extract(raw / "train_cleaned.zip", public)

    (private / "train_cleaned").mkdir(exist_ok=True, parents=True)

    for fpath in (public / "train_cleaned").glob("*.png"):
        if fpath.name in new_test_imgs:
            shutil.move(fpath, private / "train_cleaned" / fpath.name)

    # Write `answers.csv`: one row per pixel of every held-out cleaned image
    dfs = [to_df(fpath) for fpath in sorted((private / "train_cleaned").glob("*.png"))]

    answers = DataFrame(pd.concat(dfs))
    answers.to_csv(private / "answers.csv", index=False)

    shutil.rmtree(private / "train_cleaned")

    # Write `sampleSubmission.csv` (same ids, constant placeholder value)
    sample_submission = answers.copy()
    sample_submission["value"] = 1
    sample_submission.to_csv(public / "sampleSubmission.csv", index=False)

    # Sanity checks.
    # Perf fix: read each CSV and glob each directory once instead of
    # re-reading from disk inside every assertion and its message.
    n_train_imgs = len(list((public / "train").glob("*.png")))
    n_test_imgs = len(list((public / "test").glob("*.png")))
    answers_df = read_csv(private / "answers.csv")
    sample_submission_df = read_csv(public / "sampleSubmission.csv")

    assert n_train_imgs == len(old_train_img_fpaths) - len(new_test_img_fpaths), (
        f"Expected the number of images in `train` to be "
        f"{len(old_train_img_fpaths) - len(new_test_img_fpaths)}, but got "
        f"{n_train_imgs}."
    )

    assert n_test_imgs == len(new_test_img_fpaths), (
        f"Expected {len(new_test_img_fpaths)} in the `test` directory, but got "
        f"{n_test_imgs}."
    )

    assert answers_df.drop_duplicates().shape[0] == len(answers_df), (
        f"Expected `answers.csv` to have unique rows, but got "
        f"{answers_df.drop_duplicates().shape[0]} unique rows and "
        f"{len(answers_df)} rows in total."
    )

    assert len(answers_df) == len(sample_submission_df), (
        f"Expected `answers.csv` and `sampleSubmission.csv` to have the same number of rows, but "
        f"got {len(answers_df)} rows in `answers.csv` and "
        f"{len(sample_submission_df)} rows in `sampleSubmission.csv`."
    )

    assert "id" in answers_df.columns, (
        f"Expected `answers.csv` to have an 'id' column, but got "
        f"{answers_df.columns}."
    )

    assert "value" in answers_df.columns, (
        f"Expected `answers.csv` to have a 'value' column, but got "
        f"{answers_df.columns}."
    )

    assert "id" in sample_submission_df.columns, (
        f"Expected `sampleSubmission.csv` to have an 'id' column, but got "
        f"{sample_submission_df.columns}."
    )

    assert "value" in sample_submission_df.columns, (
        f"Expected `sampleSubmission.csv` to have a 'value' column, but got "
        f"{sample_submission_df.columns}."
    )
110
+
111
+
112
def to_df(img: Path) -> DataFrame:
    """Convert a grayscale image into a long DataFrame, one row per pixel.

    Each row has an 'id' of the form `<stem>_<row>_<col>` (1-indexed) and a
    'value' holding the pixel intensity scaled to [0, 1].

    Args:
        img: Path to a PNG image.

    Returns:
        A DataFrame with columns 'id' and 'value', in row-major pixel order.
    """
    image = Image.open(img).convert("L")  # "L" = 8-bit grayscale
    image_array = np.array(image) / 255.0

    rows, cols = image_array.shape
    # Perf fix: build both columns in bulk instead of appending per pixel.
    # `ravel()` yields row-major order, matching the id comprehension below.
    data = {
        "id": [
            f"{img.stem}_{row + 1}_{col + 1}"
            for row in range(rows)
            for col in range(cols)
        ],
        "value": image_array.ravel().tolist(),
    }

    df = DataFrame(data)

    assert (
        len(df) == rows * cols
    ), f"Expected the DataFrame to have {rows * cols} rows, but got {len(df)} rows."

    # Bug fix: return the DataFrame we just built and validated instead of
    # constructing a second identical copy from `data`.
    return df
@@ -0,0 +1,178 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ import numpy as np
5
+ import pandas as pd
6
+ from pandas import DataFrame, read_csv
7
+ from PIL import Image
8
+ from sklearn.model_selection import train_test_split
9
+
10
+ from mlebench.utils import extract
11
+
12
+
13
def _generate_split_artifacts(
    train_fpaths: list[Path],
    test_fpaths: list[Path],
    source_data_dir: Path,
    public_dest: Path,
    private_dest: Path,
) -> None:
    """
    Helper function to generate all required files for a given train/test split.

    This function copies the necessary image files, creates the ground-truth
    `answers.csv` for the test set, and a `sampleSubmission.csv` for the public.
    """
    public_dest.mkdir(exist_ok=True, parents=True)
    private_dest.mkdir(exist_ok=True, parents=True)

    test_img_names = {fpath.name for fpath in test_fpaths}

    # Create public train/test directories and copy images
    (public_dest / "train").mkdir(exist_ok=True)
    (public_dest / "test").mkdir(exist_ok=True)

    for fpath in train_fpaths:
        shutil.copy(fpath, public_dest / "train" / fpath.name)

    for fpath in test_fpaths:
        shutil.copy(fpath, public_dest / "test" / fpath.name)

    # Stage the cleaned (ground-truth) versions of the test images in a
    # temporary private directory.
    temp_private_cleaned_dir = private_dest / "train_cleaned"
    temp_private_cleaned_dir.mkdir(exist_ok=True)

    source_cleaned_dir = source_data_dir / "train_cleaned"
    for fpath in source_cleaned_dir.glob("*.png"):
        if fpath.name in test_img_names:
            shutil.copy(fpath, temp_private_cleaned_dir / fpath.name)

    # Write `answers.csv` for the test set (one row per pixel)
    dfs = [to_df(fpath) for fpath in sorted(temp_private_cleaned_dir.glob("*.png"))]

    answers = DataFrame(pd.concat(dfs))
    answers.to_csv(private_dest / "answers.csv", index=False)

    shutil.rmtree(temp_private_cleaned_dir)

    # Write `sampleSubmission.csv` (same ids, constant placeholder value)
    sample_submission = answers.copy()
    sample_submission["value"] = 1
    sample_submission.to_csv(public_dest / "sampleSubmission.csv", index=False)

    # Sanity checks.
    # Perf fix: read each CSV and glob each directory once instead of
    # re-reading from disk inside every assertion and its message.
    n_train_imgs = len(list((public_dest / "train").glob("*.png")))
    n_test_imgs = len(list((public_dest / "test").glob("*.png")))
    answers_df = read_csv(private_dest / "answers.csv")
    sample_submission_df = read_csv(public_dest / "sampleSubmission.csv")

    assert n_train_imgs == len(train_fpaths), (
        f"Expected the number of images in `{public_dest}/train` to be "
        f"{len(train_fpaths)}, but got "
        f"{n_train_imgs}."
    )

    assert n_test_imgs == len(test_fpaths), (
        f"Expected {len(test_fpaths)} in the `{public_dest}/test` directory, but got "
        f"{n_test_imgs}."
    )

    assert answers_df.drop_duplicates().shape[0] == len(answers_df), (
        f"Expected `{private_dest}/answers.csv` to have unique rows, but got "
        f"{answers_df.drop_duplicates().shape[0]} unique rows and "
        f"{len(answers_df)} rows in total."
    )

    assert len(answers_df) == len(sample_submission_df), (
        f"Expected `answers.csv` and `sampleSubmission.csv` to have the same number of rows, but "
        f"got {len(answers_df)} rows in `answers.csv` and "
        f"{len(sample_submission_df)} rows in `sampleSubmission.csv`."
    )

    assert "id" in answers_df.columns, (
        f"Expected `answers.csv` to have an 'id' column, but got "
        f"{answers_df.columns}."
    )

    assert "value" in answers_df.columns, (
        f"Expected `answers.csv` to have a 'value' column, but got "
        f"{answers_df.columns}."
    )

    assert "id" in sample_submission_df.columns, (
        f"Expected `sampleSubmission.csv` to have an 'id' column, but got "
        f"{sample_submission_df.columns}."
    )

    assert "value" in sample_submission_df.columns, (
        f"Expected `sampleSubmission.csv` to have a 'value' column, but got "
        f"{sample_submission_df.columns}."
    )
113
+
114
+
115
def prepare(raw: Path, public: Path, private: Path) -> None:
    """Create the original train/test split plus a parallel validation split.

    Artifacts for the original split go to `public`/`private`; artifacts for
    an extra validation split (carved out of the original training data) go
    to sibling `public_val`/`private_val` directories.

    Args:
        raw: Directory holding the raw Kaggle archives.
        public: Destination for participant-visible files.
        private: Destination for grading-only files.
    """
    # Define paths for the new validation split
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"

    # Use a temporary directory to extract raw data to avoid side-effects
    temp_source_dir = public.parent / "temp_data_source"
    if temp_source_dir.exists():
        shutil.rmtree(temp_source_dir)
    temp_source_dir.mkdir(parents=True)

    try:
        # Extract all necessary data once
        extract(raw / "train.zip", temp_source_dir)
        extract(raw / "train_cleaned.zip", temp_source_dir)
        all_img_fpaths = sorted((temp_source_dir / "train").glob("*.png"))

        # --- First Split: The Original Train/Test Split ---
        # We set new test ratio to 0.2 to keep the number of train samples at
        # the same order of magnitude while having enough samples for new test
        orig_train_fpaths, orig_test_fpaths = train_test_split(
            all_img_fpaths,
            test_size=0.2,
            random_state=0,
        )
        # Generate artifacts for the original public/private directories
        _generate_split_artifacts(
            orig_train_fpaths, orig_test_fpaths, temp_source_dir, public, private
        )

        # --- Second Split: Create Train/Validation Split from the Original Train Set ---
        # To keep the new validation set size similar to the original test set size,
        # we use test_size=0.25 (since 0.25 * 0.8 = 0.2).
        new_train_fpaths, val_fpaths = train_test_split(
            orig_train_fpaths,  # Split the original training data again
            test_size=0.25,
            random_state=0,
        )
        # Generate artifacts for the new validation directories (public_val/private_val)
        _generate_split_artifacts(
            new_train_fpaths, val_fpaths, temp_source_dir, public_val, private_val
        )
    finally:
        # Robustness fix: always remove the temporary source directory, even
        # if extraction or artifact generation raises, so a rerun starts
        # from a clean slate instead of tripping over stale extracted data.
        shutil.rmtree(temp_source_dir)
154
+
155
+
156
def to_df(img: Path) -> DataFrame:
    """Convert a grayscale image into a long DataFrame, one row per pixel.

    Each row has an 'id' of the form `<stem>_<row>_<col>` (1-indexed) and a
    'value' holding the pixel intensity scaled to [0, 1].

    Args:
        img: Path to a PNG image.

    Returns:
        A DataFrame with columns 'id' and 'value', in row-major pixel order.
    """
    image = Image.open(img).convert("L")  # "L" = 8-bit grayscale
    image_array = np.array(image) / 255.0

    rows, cols = image_array.shape
    # Perf fix: build both columns in bulk instead of appending per pixel.
    # `ravel()` yields row-major order, matching the id comprehension below.
    data = {
        "id": [
            f"{img.stem}_{row + 1}_{col + 1}"
            for row in range(rows)
            for col in range(cols)
        ],
        "value": image_array.ravel().tolist(),
    }

    df = DataFrame(data)

    assert (
        len(df) == rows * cols
    ), f"Expected the DataFrame to have {rows * cols} rows, but got {len(df)} rows."

    # Bug fix: return the DataFrame we just built and validated instead of
    # constructing a second identical copy from `data`.
    return df
@@ -0,0 +1,11 @@
1
+ import pandas as pd
2
+ from sklearn.metrics import roc_auc_score
3
+
4
+ from mlebench.competitions.utils import prepare_for_auroc_metric
5
+
6
+
7
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Compute ROC AUC of the submitted 'Insult' scores, aligned on 'Comment'."""
    inputs = prepare_for_auroc_metric(
        submission=submission,
        answers=answers,
        id_col="Comment",
        target_col="Insult",
    )
    y_true = inputs["y_true"]
    y_score = inputs["y_score"]
    return roc_auc_score(y_true=y_true, y_score=y_score)
@@ -0,0 +1,72 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ from mlebench.utils import read_csv
5
+
6
+
7
def prepare(raw: Path, public: Path, private: Path):
    """Lay out the insults competition data into public/private directories.

    The raw Kaggle dump already ships with a labelled test set
    (`test_with_solutions.csv`), so no re-splitting is required: the train
    set is copied verbatim, the labelled test set becomes the private ground
    truth, and an unlabelled public test set plus a null sample submission
    are derived from it. Ends with consistency assertions over the files as
    written to disk.
    """
    # the test set labels are provided so we can just copy things without doing the split ourselves
    shutil.copy(raw / "train.csv", public / "train.csv")
    shutil.copy(raw / "test_with_solutions.csv", private / "test.csv")

    # the public test currently online is for the verification stage, which we are ignoring
    # (we are ignoring because there is some train/test leakage and unclear how this factors in leaderboard)
    # we can recover the original public test set by removing the labels from the private test set
    # can make the gold submission while we're at it
    gold_submission = read_csv(private / "test.csv")
    # keep only the scored columns (drops the "Usage" column)
    gold_submission = gold_submission[["Insult", "Date", "Comment"]]
    gold_submission.to_csv(private / "gold_submission.csv", index=False)

    # public test = gold submission minus the label
    public_test = gold_submission.drop(columns=["Insult"]).copy()
    public_test.to_csv(public / "test.csv", index=False)

    # finally, we also make our own sample_submission, same reasons as public test
    # but match the format of what's online
    sample_submission = gold_submission.copy()
    sample_submission["Insult"] = 0
    sample_submission.to_csv(public / "sample_submission_null.csv", index=False)

    # checks
    # re-read from disk so the assertions validate the files exactly as written
    # (note: `public_test` is intentionally rebound here)
    public_train = read_csv(public / "train.csv")
    public_test = read_csv(public / "test.csv")
    private_test = read_csv(private / "test.csv")

    # no `Id` column in train, so we check comment content instead
    assert public_train.columns.to_list() == [
        "Insult",
        "Date",
        "Comment",
    ], "Train columns should be Insult, Date, Comment"
    assert public_test.columns.to_list() == [
        "Date",
        "Comment",
    ], "Test columns should be Date, Comment"
    assert sample_submission.columns.to_list() == [
        "Insult",
        "Date",
        "Comment",
    ], "Sample submission columns should be Insult, Date, Comment"
    assert gold_submission.columns.to_list() == [
        "Insult",
        "Date",
        "Comment",
    ], "Gold submission columns should be Insult, Date, Comment"
    assert private_test.columns.to_list() == [
        "Insult",
        "Date",
        "Comment",
        "Usage",
    ], "Private test columns should be Insult, Date, Comment, Usage"

    assert set(public_train["Comment"]).isdisjoint(
        set(public_test["Comment"])
    ), "None of the test comments should be in the train comments"
    assert public_test.equals(
        private_test.drop(columns=["Insult", "Usage"])
    ), "Public test should be identical to private test, modulo the Insult and Usage columns"
    assert set(public_test["Comment"]) == set(
        sample_submission["Comment"]
    ), "Public test and sample submission should have the same Comments"
    assert set(public_test["Comment"]) == set(
        gold_submission["Comment"]
    ), "Public test and gold submission should have the same Comments"
@@ -0,0 +1,128 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+ from sklearn.model_selection import train_test_split
6
+
7
+ from mlebench.utils import read_csv
8
+
9
+
10
+ def _create_split_files(
11
+ train_df: pd.DataFrame,
12
+ test_df_with_solutions: pd.DataFrame,
13
+ public_dir: Path,
14
+ private_dir: Path,
15
+ ):
16
+ """
17
+ Helper function to create the standard file structure for a given data split.
18
+
19
+ This function encapsulates the logic for generating:
20
+ - public/train.csv
21
+ - public/test.csv (unlabeled)
22
+ - public/sample_submission_null.csv
23
+ - private/test.csv (labeled, ground truth)
24
+ - private/gold_submission.csv
25
+ """
26
+ # Create directories if they don't exist
27
+ public_dir.mkdir(exist_ok=True)
28
+ private_dir.mkdir(exist_ok=True)
29
+
30
+ # Save the training data
31
+ train_df.to_csv(public_dir / "train.csv", index=False)
32
+
33
+ # Save the full test set with solutions to the private directory
34
+ test_df_with_solutions.to_csv(private_dir / "test.csv", index=False)
35
+
36
+ # Create the gold submission from the private test set
37
+ gold_submission = test_df_with_solutions[["Insult", "Date", "Comment"]].copy()
38
+ gold_submission.to_csv(private_dir / "gold_submission.csv", index=False)
39
+
40
+ # Create the public test set by dropping the label
41
+ public_test = gold_submission.drop(columns=["Insult"]).copy()
42
+ public_test.to_csv(public_dir / "test.csv", index=False)
43
+
44
+ # Create a sample submission with null labels
45
+ sample_submission = gold_submission.copy()
46
+ sample_submission["Insult"] = 0
47
+ sample_submission.to_csv(public_dir / "sample_submission_null.csv", index=False)
48
+
49
+
50
def prepare(raw: Path, public: Path, private: Path):
    """Create the original public/private split plus a parallel validation split.

    Part 1 reproduces the competition's primary layout under
    `public`/`private`. Part 2 re-splits the original training data into a
    smaller training set and a validation set (same size as the original
    test set), written to sibling `public_val`/`private_val` directories.
    """
    # Load the original, pre-split data from the raw directory
    original_train_df = read_csv(raw / "train.csv")
    original_test_df = read_csv(raw / "test_with_solutions.csv")

    # --- Part 1: Generate the original public/private split ---
    # This block uses the original data to create the competition's primary
    # train/test split.
    # NOTE(review): this writes via `to_csv`, whereas the original script used
    # `shutil.copy`; the content should be equivalent, but the files may not
    # be byte-identical (quoting/float formatting) — confirm if exact bytes
    # matter downstream.
    _create_split_files(original_train_df, original_test_df, public, private)

    # --- Part 2: Generate the new validation split ---
    # This block creates a new split for local validation. It takes the original
    # training data and splits it again, creating a new, smaller training set
    # and a validation set. The outputs are saved to parallel directories.

    # Define paths for the new validation set directories
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"

    # Split the original training data to create a new train and validation set.
    # The size of the validation set is chosen to be the same as the original
    # test set (an integer `test_size` is an absolute row count).
    # We stratify on the 'Insult' column to maintain label distribution.
    train_val_df, test_val_df = train_test_split(
        original_train_df,
        test_size=len(original_test_df),
        random_state=42,
        stratify=original_train_df["Insult"],
    )

    # Use the same helper function to create the files for the validation split,
    # ensuring an identical directory structure and naming convention.
    _create_split_files(train_val_df, test_val_df, public_val, private_val)

    # --- Final Checks ---
    # checks for the original split, re-read from disk so the assertions
    # validate the files exactly as written
    public_train = read_csv(public / "train.csv")
    public_test = read_csv(public / "test.csv")
    private_test = read_csv(private / "test.csv")
    sample_submission = read_csv(public / "sample_submission_null.csv")
    gold_submission = read_csv(private / "gold_submission.csv")

    assert public_train.columns.to_list() == [
        "Insult",
        "Date",
        "Comment",
    ], "Train columns should be Insult, Date, Comment"
    assert public_test.columns.to_list() == [
        "Date",
        "Comment",
    ], "Test columns should be Date, Comment"
    assert sample_submission.columns.to_list() == [
        "Insult",
        "Date",
        "Comment",
    ], "Sample submission columns should be Insult, Date, Comment"
    assert gold_submission.columns.to_list() == [
        "Insult",
        "Date",
        "Comment",
    ], "Gold submission columns should be Insult, Date, Comment"
    assert private_test.columns.to_list() == [
        "Insult",
        "Date",
        "Comment",
        "Usage",
    ], "Private test columns should be Insult, Date, Comment, Usage"

    assert set(public_train["Comment"]).isdisjoint(
        set(public_test["Comment"])
    ), "None of the test comments should be in the train comments"
    assert public_test.equals(
        private_test.drop(columns=["Insult", "Usage"])
    ), "Public test should be identical to private test, modulo the Insult and Usage columns"
    assert set(public_test["Comment"]) == set(
        sample_submission["Comment"]
    ), "Public test and sample submission should have the same Comments"
    assert set(public_test["Comment"]) == set(
        gold_submission["Comment"]
    ), "Public test and gold submission should have the same Comments"