dslighting 1.7.1__py3-none-any.whl → 1.7.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352)
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/METADATA +3 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,199 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+ from sklearn.model_selection import train_test_split
6
+
7
+
8
def get_date(s: str) -> str:
    """Extract a zero-padded ``YYYY-MM-DD`` date from a drive-ID-like string.

    The input is expected to look like ``YYYY-MM-DD-X`` where ``X`` is an
    arbitrary (possibly ``-``-separated) suffix, e.g. ``2021-04-29-US-MTV-1``.

    Args:
        s: The string to parse.

    Returns:
        The date portion normalised to ``YYYY-MM-DD`` (month and day are
        zero-padded to two digits).

    Raises:
        AssertionError: If ``s`` has fewer than 3 ``-``-separated parts, or if
            any of the first three parts is not purely numeric.
    """
    split = s.split("-")

    assert (
        len(split) >= 3
    ), f"Expected the string to have at least 3 parts separated by `-`. Got {len(split)} parts."

    year, month, day = split[:3]

    # `str.split` always yields strings, so a separate `isinstance(..., str)`
    # check on each component is redundant; only numeric-ness needs checking.
    assert year.isdigit(), f"Expected the year to be a string of digits. Got {year} instead."
    assert month.isdigit(), f"Expected the month to be a string of digits. Got {month} instead."
    assert day.isdigit(), f"Expected the day to be a string of digits. Got {day} instead."

    return f"{year}-{month.zfill(2)}-{day.zfill(2)}"
34
+
35
+
36
def _process_split(
    raw_data_path: Path,
    train_ids: list,
    test_ids: list,
    public_path: Path,
    private_path: Path,
) -> None:
    """
    Helper function to process a single data split.

    It populates the public and private directories with the provided train/test IDs,
    creating the necessary file structure and artifacts (like the sample submission).

    Args:
        raw_data_path: Root of the raw competition data; drives are read from
            ``raw_data_path / "train" / <drive_id>``.
        train_ids: Drive IDs copied (with labels) into ``public_path / "train"``.
        test_ids: Drive IDs copied into ``public_path / "test"`` with their
            ``ground_truth.csv`` files stripped out after label extraction.
        public_path: Output directory visible to agents.
        private_path: Output directory holding the held-out test labels.

    Raises:
        AssertionError: If the produced directories or labels are inconsistent
            with the requested split.
    """
    # Clean and create output directories
    shutil.rmtree(public_path, ignore_errors=True)
    shutil.rmtree(private_path, ignore_errors=True)
    public_path.mkdir(parents=True)
    private_path.mkdir(parents=True)
    (public_path / "train").mkdir()
    (public_path / "test").mkdir()

    for train_id in train_ids:
        shutil.copytree(
            src=raw_data_path / "train" / train_id,
            dst=public_path / "train" / train_id,
        )

    for test_id in test_ids:
        shutil.copytree(
            src=raw_data_path / "train" / test_id,
            dst=public_path / "test" / test_id,
        )

    # Construct test set by concatenating all ground truth csvs for the test journeys.
    # Layout is test/<drive_id>/<phone_id>/ground_truth.csv, hence parent.parent below.
    dfs = []
    for fpath in sorted((public_path / "test").rglob("ground_truth.csv")):
        drive_id = fpath.parent.parent.name
        phone_id = fpath.parent.name

        assert (
            drive_id in test_ids
        ), f"Expected the drive {drive_id} to be one of the new test instances. Got {drive_id} instead."

        raw_df = pd.read_csv(fpath)
        df = raw_df.copy()
        df.loc[:, "tripId"] = f"{drive_id}-{phone_id}"
        df = df[["tripId", "UnixTimeMillis", "LatitudeDegrees", "LongitudeDegrees"]]
        dfs.append(df)

    new_test_labels = pd.concat(dfs, ignore_index=True)
    # The output filename is 'test.csv' to match the competition structure.
    new_test_labels.to_csv(private_path / "test.csv", index=False)

    for fpath in (public_path / "test").rglob("ground_truth.csv"):
        fpath.unlink()  # don't include ground truth in public test data

    shutil.copytree(
        src=raw_data_path / "metadata",
        dst=public_path / "metadata",
    )

    # Each tripId is "<drive_id>-<phone_id>"; strip the trailing phone segment
    # to recover the set of distinct journeys present in the test labels.
    actual_journey_ids = set(["-".join(s.split("-")[:-1]) for s in new_test_labels["tripId"]])
    # BUGFIX: the failure message previously reported the number of unique
    # *trip* IDs, which is not what the condition checks; report the number of
    # unique journey IDs instead so a failure is diagnosable.
    assert len(actual_journey_ids) == len(test_ids), (
        f"Expected the new test instances to have {len(test_ids)} unique journey IDs. Got "
        f"{len(actual_journey_ids)} unique journey IDs."
    )

    sample_submission = new_test_labels.copy()
    # Fixed placeholder coordinates: every row points at the same location,
    # which is sufficient as a format example.
    sample_submission.loc[:, "LatitudeDegrees"] = 37.904611315634504
    sample_submission.loc[:, "LongitudeDegrees"] = -86.48107806249548

    assert len(sample_submission) == len(new_test_labels), (
        f"Expected the sample submission to have the same number of instances as the new test "
        f"instances. Got {len(sample_submission)} instances in the sample submission and "
        f"{len(new_test_labels)} new test instances."
    )

    sample_submission.to_csv(public_path / "sample_submission.csv", index=False)

    # Sanity checks on the produced directory layout.
    assert sorted(list(public_path.glob("train/*"))) == sorted(
        set([public_path / "train" / drive_id for drive_id in train_ids])
    ), "Expected the public train directory to contain the new train instances."

    assert sorted(list(public_path.glob("test/*"))) == sorted(
        set([public_path / "test" / drive_id for drive_id in test_ids])
    ), "Expected the public test directory to contain the new test instances."

    assert (
        len(list((public_path / "test").rglob("ground_truth.csv"))) == 0
    ), "Expected the public test directory to not contain any ground truth files."

    assert len(list((public_path / "train").rglob("ground_truth.csv"))) >= len(train_ids), (
        "Expected the public train directory to contain at least one ground truth file per new "
        "train instance."
    )
131
+
132
+
133
def prepare(raw: Path, public: Path, private: Path) -> None:
    """Prepare the competition data plus a parallel validation split.

    Stage 1 splits the raw training drives by date into the primary
    train/test sets, written to ``public``/``private``. Stage 2 applies the
    identical procedure to the stage-1 training drives, producing a
    validation split in sibling ``public_val``/``private_val`` directories.
    """
    # --- Stage 1: Original Split (Train / Test) ---
    # Creates the primary competition data in `public` and `private`.
    all_drive_ids = sorted(d.name for d in (raw / "train").glob("*") if d.is_dir())
    unique_dates = sorted({get_date(drive_id) for drive_id in all_drive_ids})
    stage1_train_dates, stage1_test_dates = train_test_split(
        unique_dates, test_size=0.1, random_state=0
    )

    assert (
        len(stage1_train_dates) >= 1
    ), "Expected the new train set to have at least one date. Got 0 dates."
    assert (
        len(stage1_test_dates) >= 1
    ), "Expected the new test set to have at least one date. Got 0 dates."

    stage1_train_ids = sorted(i for i in all_drive_ids if get_date(i) in stage1_train_dates)
    stage1_test_ids = sorted(i for i in all_drive_ids if get_date(i) in stage1_test_dates)

    overlap = set(stage1_train_ids).intersection(set(stage1_test_ids))
    assert not overlap, (
        f"Expected the new train and test instances to be disjoint. Got an intersection of "
        f"{overlap}."
    )

    assert len(stage1_train_ids) + len(stage1_test_ids) == len(all_drive_ids), (
        f"Expected the number of new train and test instances to sum up to the number of old train "
        f"instances. Got {len(stage1_train_ids)} new train instances and {len(stage1_test_ids)} new test "
        f"instances which sum to {len(stage1_train_ids) + len(stage1_test_ids)} instead of "
        f"{len(all_drive_ids)}."
    )

    _process_split(
        raw_data_path=raw,
        train_ids=stage1_train_ids,
        test_ids=stage1_test_ids,
        public_path=public,
        private_path=private,
    )

    # --- Stage 2: Validation Split (Train_val / Test_val) ---
    # Re-splits the stage-1 training drives with the exact same methodology,
    # writing a validation dataset to parallel directories.
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"

    # The input for this split is the training set from the *first* split.
    val_input_dates = sorted({get_date(i) for i in stage1_train_ids})

    # Same parameters as stage 1 so the validation test set has a similar
    # proportional size to the original test set.
    train_val_dates, test_val_dates = train_test_split(
        val_input_dates, test_size=0.1, random_state=0
    )

    train_val_ids = sorted(i for i in stage1_train_ids if get_date(i) in train_val_dates)
    test_val_ids = sorted(i for i in stage1_train_ids if get_date(i) in test_val_dates)

    _process_split(
        raw_data_path=raw,
        train_ids=train_val_ids,
        test_ids=test_val_ids,
        public_path=public_val,
        private_path=private_val,
    )
@@ -0,0 +1,11 @@
1
+ import pandas as pd
2
+ from sklearn.metrics import accuracy_score
3
+
4
+ from mlebench.competitions.utils import prepare_for_accuracy_metric
5
+
6
+
7
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Return the accuracy of the "Transported" predictions, joined on "PassengerId"."""
    return accuracy_score(
        **prepare_for_accuracy_metric(
            submission=submission,
            answers=answers,
            target_column="Transported",
            id_column="PassengerId",
        )
    )
@@ -0,0 +1,23 @@
1
+ from pathlib import Path
2
+
3
+ from sklearn.model_selection import train_test_split
4
+
5
+ from mlebench.utils import read_csv
6
+
7
+
8
def prepare(raw: Path, public: Path, private: Path):
    """Split the raw training data into public train/test files, an all-False
    sample submission, and a private labelled test set for grading."""
    # 90/10 split of the original training data.
    full_train = read_csv(raw / "train.csv")
    train_df, test_df = train_test_split(full_train, test_size=0.1, random_state=0)

    # Public side: the training data and the unlabelled test rows.
    train_df.to_csv(public / "train.csv", index=False)
    test_df.drop("Transported", axis="columns").to_csv(public / "test.csv", index=False)

    # Sample submission: the test ids with every prediction set to False.
    sample = test_df[["PassengerId", "Transported"]].copy()
    sample["Transported"] = False
    sample.to_csv(public / "sample_submission.csv", index=False)

    # Private side keeps the labels for grading.
    test_df.to_csv(private / "test.csv", index=False)
@@ -0,0 +1,61 @@
1
+ from pathlib import Path
2
+
3
+ import pandas as pd
4
+ from sklearn.model_selection import train_test_split
5
+
6
+ from mlebench.utils import read_csv
7
+
8
+
9
def _create_split_files(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    public_dir: Path,
    private_dir: Path,
):
    """Write one train/test split to its public and private directories.

    The public side gets `train.csv`, a label-free `test.csv`, and a
    constant-False `sample_submission.csv`; the private side keeps the
    labelled `test.csv` for grading.
    """
    # Sample submission: the test ids with every prediction set to False.
    sample = test_df[["PassengerId", "Transported"]].copy()
    sample["Transported"] = False
    sample.to_csv(public_dir / "sample_submission.csv", index=False)

    # Private side keeps the labels for grading.
    test_df.to_csv(private_dir / "test.csv", index=False)

    # Public side: the training data and the unlabelled test rows.
    train_df.to_csv(public_dir / "train.csv", index=False)
    unlabelled_test = test_df.drop("Transported", axis="columns")
    unlabelled_test.to_csv(public_dir / "test.csv", index=False)
29
+
30
+
31
def prepare(raw: Path, public: Path, private: Path):
    """Build the main competition split plus a parallel validation split.

    The main 90/10 split goes to `public`/`private`; a second split of the
    main training data (held-out set sized like the main test set) goes to
    sibling `public_val`/`private_val` directories with identical filenames.
    """
    # Make sure every output directory exists up front.
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"
    for directory in (public, private, public_val, private_val):
        directory.mkdir(parents=True, exist_ok=True)

    # --- Main split: 90/10 train/test from the raw training data ---
    full_train = read_csv(raw / "train.csv")
    main_train, main_test = train_test_split(full_train, test_size=0.1, random_state=0)

    # Original public/private outputs are unchanged by the validation split.
    _create_split_files(main_train, main_test, public, private)

    # --- Validation split: carve a held-out set out of the main training data ---
    # Sized so the validation set is approximately as large as the main test set.
    val_fraction = len(main_test) / len(main_train)
    val_train, val_test = train_test_split(
        main_train, test_size=val_fraction, random_state=0
    )
    _create_split_files(val_train, val_test, public_val, private_val)
@@ -0,0 +1 @@
1
# Target classes for this competition, in the column order expected by
# submissions. Presumably author initials (EAP/HPL/MWS) — the codes are
# used verbatim as one-hot column names by the prepare/grade scripts.
CLASSES = ["EAP", "HPL", "MWS"]
@@ -0,0 +1,38 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.metrics import log_loss
4
+
5
+ from mlebench.competitions.utils import one_hot_dfs_to_log_loss_inputs
6
+ from mlebench.grade_helpers import InvalidSubmissionError
7
+
8
+ from .classes import CLASSES
9
+
10
# Absolute tolerance used when checking that each submission row's
# probabilities sum to one (floating-point slack).
tolerance = 1e-6
11
+
12
+
13
def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame) -> dict:
    """Validate the submission's probability columns and build the kwargs for
    `sklearn.metrics.log_loss`.

    Raises:
        InvalidSubmissionError: if the submission has the wrong shape, rows
            that do not sum to one, or probabilities outside [0, 1].
    """
    expected_shape = (len(answers), len(CLASSES) + 1)
    if submission.shape != expected_shape:
        raise InvalidSubmissionError(
            f"Submission shape {submission.shape} does not match answers shape {answers.shape}."
        )

    # All columns after the id column are class probabilities.
    probabilities = submission.iloc[:, 1:]

    # Each row must sum to one, within floating-point tolerance.
    if not np.allclose(probabilities.sum(axis=1), 1, atol=tolerance):
        raise InvalidSubmissionError("Each row in submission should sum to one, as probabilities.")

    # Every individual probability must lie in [0, 1].
    if not ((probabilities >= 0) & (probabilities <= 1)).all().all():
        raise InvalidSubmissionError(
            "All probabilities in submission DataFrame must be between 0 and 1."
        )

    return one_hot_dfs_to_log_loss_inputs(
        submission, answers, id_column="id", apply_softmax=False
    )
34
+
35
+
36
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Multi-class log loss of the validated submission against the one-hot answers."""
    return log_loss(**prepare_for_metric(submission, answers))
@@ -0,0 +1,40 @@
1
+ from pathlib import Path
2
+
3
+ from sklearn.model_selection import train_test_split
4
+
5
+ from mlebench.competitions.utils import df_to_one_hot
6
+ from mlebench.utils import extract, read_csv
7
+
8
+ from .classes import CLASSES
9
+
10
+
11
+ def prepare(raw: Path, public: Path, private: Path):
12
+ target_col = "author"
13
+ id_col = "id"
14
+
15
+ # extract only what we need
16
+ extract(raw / "train.zip", raw)
17
+
18
+ # Create train, test from train split
19
+ old_train = read_csv(raw / "train.csv")
20
+ new_train, new_test = train_test_split(old_train, test_size=0.1, random_state=0)
21
+ new_test_without_labels = new_test.drop(columns=[target_col])
22
+
23
+ # private test matches the format of sample submission
24
+ one_hot_new_test = df_to_one_hot(
25
+ new_test.drop(columns=["text"]),
26
+ id_column=id_col,
27
+ target_column=target_col,
28
+ classes=CLASSES,
29
+ )
30
+ # fill the sample submission with arbitrary values (matching kaggle.com)
31
+ sample_submission = one_hot_new_test.copy()
32
+ sample_submission["EAP"] = 0.403493538995863
33
+ sample_submission["HPL"] = 0.287808366106543
34
+ sample_submission["MWS"] = 0.308698094897594
35
+
36
+ # save files
37
+ new_train.to_csv(public / "train.csv", index=False)
38
+ new_test_without_labels.to_csv(public / "test.csv", index=False)
39
+ sample_submission.to_csv(public / "sample_submission.csv", index=False)
40
+ one_hot_new_test.to_csv(private / "test.csv", index=False)
@@ -0,0 +1,78 @@
1
+ from pathlib import Path
2
+
3
+ from sklearn.model_selection import train_test_split
4
+
5
+ from mlebench.competitions.utils import df_to_one_hot
6
+ from mlebench.utils import extract, read_csv
7
+
8
+ from .classes import CLASSES
9
+
10
+
11
def _write_split(
    train_df,
    test_df,
    public_dir: Path,
    private_dir: Path,
    id_col: str,
    target_col: str,
) -> None:
    """Write one train/test split: public train/test/sample_submission files
    and the private one-hot encoded answer key."""
    test_without_labels = test_df.drop(columns=[target_col])

    # private test matches the format of sample submission
    one_hot_test = df_to_one_hot(
        test_df.drop(columns=["text"]),
        id_column=id_col,
        target_column=target_col,
        classes=CLASSES,
    )

    # fill the sample submission with arbitrary values (matching kaggle.com)
    sample_submission = one_hot_test.copy()
    sample_submission["EAP"] = 0.403493538995863
    sample_submission["HPL"] = 0.287808366106543
    sample_submission["MWS"] = 0.308698094897594

    train_df.to_csv(public_dir / "train.csv", index=False)
    test_without_labels.to_csv(public_dir / "test.csv", index=False)
    sample_submission.to_csv(public_dir / "sample_submission.csv", index=False)
    one_hot_test.to_csv(private_dir / "test.csv", index=False)


def prepare(raw: Path, public: Path, private: Path):
    """Build the main Spooky Author split plus a parallel validation split.

    The main 90/10 split is written to `public`/`private` (unchanged from the
    original prepare script); the main training set is then split again with
    the same parameters and written to sibling `public_val`/`private_val`
    directories using identical filenames.
    """
    target_col = "author"
    id_col = "id"

    # extract only what we need
    extract(raw / "train.zip", raw)

    # == Original split (for `public` and `private` directories) ==
    old_train = read_csv(raw / "train.csv")
    train_main, test_main = train_test_split(old_train, test_size=0.1, random_state=0)
    _write_split(train_main, test_main, public, private, id_col, target_col)

    # == Validation split (for `public_val` and `private_val` directories) ==
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"
    public_val.mkdir(parents=True, exist_ok=True)
    private_val.mkdir(parents=True, exist_ok=True)

    # Split the main training set again with the same parameters so the
    # validation set size is consistent with the main test set size.
    train_val, test_val = train_test_split(train_main, test_size=0.1, random_state=0)
    _write_split(train_val, test_val, public_val, private_val, id_col, target_col)
@@ -0,0 +1,65 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.metrics import root_mean_squared_error
4
+
5
+ from mlebench.grade_helpers import InvalidSubmissionError
6
+
7
+
8
+ def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
9
+ if len(submission) != len(answers):
10
+ raise InvalidSubmissionError(
11
+ f"Expected submission to be the same length as answers, but got {len(submission)} "
12
+ f"instead of {len(answers)}."
13
+ )
14
+
15
+ to_predict = ["reactivity", "deg_Mg_pH10", "deg_Mg_50C"]
16
+ expected_answer_columns = ["id_seqpos"] + to_predict + ["keep"]
17
+
18
+ assert set(answers.columns).issuperset(expected_answer_columns), (
19
+ f"Expected answers to have columns {expected_answer_columns}, but instead it has "
20
+ f"columns {answers.columns}."
21
+ )
22
+
23
+ # The submission csv contains two columns which aren't used for scoring: `deg_pH10` and
24
+ # `deg_50C`. These are nonetheless still required to be in the submission as per the
25
+ # competition rules. See the "Sample Submission" section of the competition overview page for
26
+ # more information:
27
+ # https://www.kaggle.com/competitions/stanford-covid-vaccine/overview/evaluation
28
+ expected_submission_columns = ["id_seqpos"] + to_predict + ["deg_pH10", "deg_50C"]
29
+
30
+ if not set(submission.columns).issuperset(expected_submission_columns):
31
+ raise InvalidSubmissionError(
32
+ f"Expected the submission to have columns {expected_submission_columns}, but instead "
33
+ f"it has columns {submission.columns}."
34
+ )
35
+
36
+ filtered_submission = submission[expected_submission_columns]
37
+
38
+ # Sort rows by `id_seqpos` and columns alphabetically
39
+ sorted_submission = filtered_submission.sort_values(by="id_seqpos").sort_index(axis=1)
40
+ sorted_answers = answers.sort_values(by="id_seqpos").sort_index(axis=1)
41
+
42
+ for i, (actual_id, expected_id) in enumerate(
43
+ zip(sorted_submission["id_seqpos"], sorted_answers["id_seqpos"])
44
+ ):
45
+ if actual_id == expected_id:
46
+ continue
47
+
48
+ raise InvalidSubmissionError(
49
+ f"Expected submission to have the same `id_seqpos` as answers, but got `{actual_id}` "
50
+ f"instead of `{expected_id}` on row {i} of the submission."
51
+ )
52
+
53
+ mask = sorted_answers["keep"]
54
+ new_submission = sorted_submission[mask]
55
+ new_answers = sorted_answers[mask]
56
+
57
+ errors = []
58
+
59
+ for column in to_predict:
60
+ y_pred = new_submission[column]
61
+ y_true = new_answers[column]
62
+ error = root_mean_squared_error(y_true=y_true, y_pred=y_pred)
63
+ errors.append(error)
64
+
65
+ return np.mean(errors)
@@ -0,0 +1,129 @@
1
+ from pathlib import Path
2
+
3
+ import pandas as pd
4
+
5
+
6
def prepare(raw: Path, public: Path, private: Path) -> None:
    """Split the OpenVaccine training data into new train and test sets.

    Writes:
      * ``public/train.json`` - the new training set (JSON lines).
      * ``public/test.json`` - the new test set without target columns.
      * ``public/sample_submission.csv`` - all-zero predictions, one row per
        sequence position of every test sample.
      * ``private/test.csv`` - the answers, exploded to one row per sequence
        position (``id_seqpos``), with a ``keep`` flag marking the positions
        that are actually scored.
    """
    old_train = pd.read_json(raw / "train.json", lines=True)
    old_test = pd.read_json(raw / "test.json", lines=True)
    old_sample_submission = pd.read_csv(raw / "sample_submission.csv")

    to_predict = ["reactivity", "deg_Mg_pH10", "deg_pH10", "deg_Mg_50C", "deg_50C"]
    test_size = 0.1
    n_test_samples = int(len(old_train) * test_size)

    # only put samples that pass the SN filter in the test set, as per comp data desc
    old_train["test"] = False
    test_indices = (
        old_train[old_train["SN_filter"] > 0].sample(n=n_test_samples, random_state=0).index
    )
    old_train.loc[test_indices, "test"] = True

    new_train = old_train[~old_train["test"]].copy().drop(columns=["test"])
    new_test = old_train[old_train["test"]].copy().drop(columns=["test"])
    old_train = old_train.drop(columns=["test"])

    # Create `test.csv` by exploding each list in the `reactivity` and `deg_*` columns, analogous
    # to `pd.explode`. Only the first `seq_scored` items are scored out of a possible `seq_length`
    # items. For each row, we keep track of whether it's scored or not with the `keep` column.
    records = []

    for _, row in new_test.iterrows():
        n = row["seq_scored"]

        # Every target list must cover exactly the scored positions.
        for target in to_predict:
            assert len(row[target]) == n

        # Scored positions: real values, keep=True.
        for j in range(n):
            records.append(
                {
                    "id_seqpos": f"{row['id']}_{j}",
                    "reactivity": row["reactivity"][j],
                    "deg_Mg_pH10": row["deg_Mg_pH10"][j],
                    "deg_pH10": row["deg_pH10"][j],
                    "deg_Mg_50C": row["deg_Mg_50C"][j],
                    "deg_50C": row["deg_50C"][j],
                    "keep": True,
                }
            )

        k = row["seq_length"]

        assert n < k

        # Unscored tail positions: zero-filled, keep=False (still required in
        # submissions, but ignored by the grader).
        for j in range(n, k):
            records.append(
                {
                    "id_seqpos": f"{row['id']}_{j}",
                    "reactivity": 0.0,
                    "deg_Mg_pH10": 0.0,
                    "deg_pH10": 0.0,
                    "deg_Mg_50C": 0.0,
                    "deg_50C": 0.0,
                    "keep": False,
                }
            )

    # Write `answers.csv`
    answers = pd.DataFrame(records)
    answers.to_csv(private / "test.csv", index=False, float_format="%.10f")

    # Write `train.json`
    # NOTE(review): this assumes the raw train.json already carries an `index`
    # field (otherwise the column-set sanity check below would fail) — confirm.
    new_train["index"] = range(len(new_train))
    new_train.to_json(public / "train.json", orient="records", lines=True)

    # Write `test.json`
    new_test_without_labels = new_test[old_test.columns].copy()
    new_test_without_labels["index"] = range(len(new_test_without_labels))
    new_test_without_labels.to_json(public / "test.json", orient="records", lines=True)

    # Write `sample_submission.csv`
    new_sample_submission = answers[["id_seqpos"] + to_predict].copy()
    new_sample_submission.loc[:, to_predict] = 0.0
    new_sample_submission.to_csv(
        public / "sample_submission.csv", index=False, float_format="%.10f"
    )

    # Sanity checks
    assert set(new_train.columns) == set(old_train.columns), (
        f"Expected the columns of the new train to be the same as the old train, but got "
        f"{set(new_train.columns)} instead of {set(old_train.columns)}."
    )

    assert set(new_test_without_labels.columns) == set(old_test.columns), (
        f"Expected the columns of the new test to be the same as the old test, but got "
        f"{set(new_test_without_labels.columns)} instead of {set(old_test.columns)}."
    )

    assert set(to_predict).intersection(set(new_test_without_labels.columns)) == set(), (
        f"Expected the columns to predict aren't included in the new test, but got "
        f"{set(to_predict) ^ set(new_test_without_labels.columns)} instead of the empty set."
    )

    assert set(new_sample_submission.columns) == set(old_sample_submission.columns), (
        f"Expected the columns of the new sample submission to be the same as the old sample "
        f"submission, but got {set(new_sample_submission.columns)} instead of "
        f"{set(old_sample_submission.columns)}."
    )

    assert len(answers) == len(new_sample_submission), (
        f"Expected the answers to have the same length as the new sample submission, but got "
        f"{len(answers)} instead of {len(new_sample_submission)}."
    )

    # we can use [0] because all sequences have the same length
    assert len(new_sample_submission) == (
        len(new_test_without_labels) * new_test_without_labels["seq_length"].iloc[0]
    ), (
        "Expected new_sample_submission length to be equal to max seq_length * len(new_test). "
        # Fixed: the message previously interpolated the whole `seq_length`
        # Series instead of the scalar first element.
        f"Got {len(new_sample_submission)} instead of "
        f"{len(new_test_without_labels) * new_test_without_labels['seq_length'].iloc[0]}."
    )

    assert len(new_train) + len(new_test) == len(old_train), (
        f"Expected the length of the new train set plus the length of the new test set to be "
        f"equal to the length of the old train set, but got {len(new_train) + len(new_test)} "
        f"instead of {len(old_train)}."
    )