dslighting-1.7.1-py3-none-any.whl → dslighting-1.7.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352)
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/METADATA +1 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/entry_points.txt +0 -0
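The bulk of the 1.7.6 release is a vendored copy of the mlebench competition harness (the new top-level package accounting for the +1 in top_level.txt). The competition directories follow a uniform contract, visible in the hunks below: prepare.py exposes prepare(raw, public, private), which splits a raw Kaggle dump into a public training set and a private answer set; grade.py exposes grade(submission, answers) -> float; and the prepare_val.py variants additionally carve a validation split into sibling public_val/private_val directories. A minimal sketch of driving one competition by hand, assuming the wheel is installed — the paths are invented, and loading the hyphen-named modules through importlib is an assumption (mlebench/registry.py resolves competitions internally):

import importlib
from pathlib import Path

import pandas as pd

# Invented working directories for a single competition.
base = Path("workspace/spaceship-titanic")
raw, public, private = base / "raw", base / "public", base / "private"
for d in (public, private):
    d.mkdir(parents=True, exist_ok=True)

# Competition directories contain hyphens, so a plain `import` statement cannot
# name them; importlib treats the dotted path as an opaque string.
prepare_mod = importlib.import_module("mlebench.competitions.spaceship-titanic.prepare")
grade_mod = importlib.import_module("mlebench.competitions.spaceship-titanic.grade")

prepare_mod.prepare(raw, public, private)  # writes train/test/sample_submission

submission = pd.read_csv(public / "sample_submission.csv")
answers = pd.read_csv(private / "test.csv")  # held-out ground truth
print(grade_mod.grade(submission, answers))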
mlebench/competitions/stanford-covid-vaccine/prepare_val.py
@@ -0,0 +1,199 @@
+ from pathlib import Path
+
+ import pandas as pd
+
+
+ def _create_competition_files(
+     train_df: pd.DataFrame,
+     test_df: pd.DataFrame,
+     old_test_columns: pd.Index,
+     old_sample_submission: pd.DataFrame,
+     public_dir: Path,
+     private_dir: Path,
+     to_predict: list,
+ ) -> None:
+     """
+     Helper function to generate the set of competition files for a given train/test split.
+     This function creates the public and private directories and populates them with:
+     - public/train.json
+     - public/test.json
+     - public/sample_submission.csv
+     - private/test.csv (ground truth)
+     """
+     public_dir.mkdir(parents=True, exist_ok=True)
+     private_dir.mkdir(parents=True, exist_ok=True)
+
+     # Create `test.csv` by exploding each list in the `reactivity` and `deg_*` columns, analogous
+     # to `pd.explode`. Only the first `seq_scored` items are scored out of a possible `seq_length`
+     # items. For each row, we keep track of whether it's scored or not with the `keep` column.
+     records = []
+
+     for _, row in test_df.iterrows():
+         n = row["seq_scored"]
+
+         assert len(row["reactivity"]) == n
+         assert len(row["deg_Mg_pH10"]) == n
+         assert len(row["deg_pH10"]) == n
+         assert len(row["deg_Mg_50C"]) == n
+         assert len(row["deg_50C"]) == n
+
+         for j in range(n):
+             records.append(
+                 {
+                     "id_seqpos": f"{row['id']}_{j}",
+                     "reactivity": row["reactivity"][j],
+                     "deg_Mg_pH10": row["deg_Mg_pH10"][j],
+                     "deg_pH10": row["deg_pH10"][j],
+                     "deg_Mg_50C": row["deg_Mg_50C"][j],
+                     "deg_50C": row["deg_50C"][j],
+                     "keep": True,
+                 }
+             )
+
+         k = row["seq_length"]
+
+         assert n < k
+
+         for j in range(n, k):
+             records.append(
+                 {
+                     "id_seqpos": f"{row['id']}_{j}",
+                     "reactivity": 0.0,
+                     "deg_Mg_pH10": 0.0,
+                     "deg_pH10": 0.0,
+                     "deg_Mg_50C": 0.0,
+                     "deg_50C": 0.0,
+                     "keep": False,
+                 }
+             )
+
+     # Write the ground-truth answers to `private/test.csv`
+     answers = pd.DataFrame(records)
+     answers.to_csv(private_dir / "test.csv", index=False, float_format="%.10f")
+
+     # Write `train.json`
+     train_df["index"] = range(len(train_df))
+     train_df.to_json(public_dir / "train.json", orient="records", lines=True)
+
+     # Write `test.json`
+     test_without_labels = test_df[old_test_columns].copy()
+     test_without_labels["index"] = range(len(test_without_labels))
+     test_without_labels.to_json(public_dir / "test.json", orient="records", lines=True)
+
+     # Write `sample_submission.csv`
+     new_sample_submission = answers[["id_seqpos"] + to_predict].copy()
+     new_sample_submission.loc[:, to_predict] = 0.0
+     new_sample_submission.to_csv(
+         public_dir / "sample_submission.csv", index=False, float_format="%.10f"
+     )
+
+     # Sanity checks
+     assert "test" not in train_df.columns
+     assert "test" not in test_df.columns
+
+     assert set(test_without_labels.columns) == set(old_test_columns), (
+         f"Expected the columns of the new test to be the same as the old test, but got "
+         f"{set(test_without_labels.columns)} instead of {set(old_test_columns)}."
+     )
+
+     assert set(to_predict).intersection(set(test_without_labels.columns)) == set(), (
+         f"Expected none of the columns to predict to be included in the new test, but got "
+         f"{set(to_predict) & set(test_without_labels.columns)} instead of the empty set."
+     )
+
+     assert set(new_sample_submission.columns) == set(old_sample_submission.columns), (
+         f"Expected the columns of the new sample submission to be the same as the old sample "
+         f"submission, but got {set(new_sample_submission.columns)} instead of "
+         f"{set(old_sample_submission.columns)}."
+     )
+
+     assert len(answers) == len(new_sample_submission), (
+         f"Expected the answers to have the same length as the new sample submission, but got "
+         f"{len(answers)} instead of {len(new_sample_submission)}."
+     )
+
+     # we can use [0] because all sequences have the same length
+     assert len(new_sample_submission) == (
+         len(test_without_labels) * test_without_labels["seq_length"].iloc[0]
+     ), (
+         "Expected new_sample_submission length to be equal to max seq_length * len(new_test). "
+         f"Got {len(new_sample_submission)} instead of "
+         f"{len(test_without_labels) * test_without_labels['seq_length'].iloc[0]}."
+     )
+
+
+ def prepare(raw: Path, public: Path, private: Path) -> None:
+     old_train = pd.read_json(raw / "train.json", lines=True)
+     old_test = pd.read_json(raw / "test.json", lines=True)
+     old_sample_submission = pd.read_csv(raw / "sample_submission.csv")
+
+     to_predict = ["reactivity", "deg_Mg_pH10", "deg_pH10", "deg_Mg_50C", "deg_50C"]
+     test_size = 0.1
+     n_test_samples = int(len(old_train) * test_size)
+
+     # First split: Create the main train and test sets from the raw data.
+     # Only put samples that pass the SN filter in the test set, as per the competition's data description.
+     old_train["test"] = False
+     test_indices = (
+         old_train[old_train["SN_filter"] > 0].sample(n=n_test_samples, random_state=0).index
+     )
+     old_train.loc[test_indices, "test"] = True
+
+     new_train = old_train[~old_train["test"]].copy().drop(columns=["test"])
+     new_test = old_train[old_train["test"]].copy().drop(columns=["test"])
+
+     # Generate the original competition files. This ensures the contents of the `public`
+     # and `private` directories remain identical to the original script's output.
+     _create_competition_files(
+         train_df=new_train,
+         test_df=new_test,
+         old_test_columns=old_test.columns,
+         old_sample_submission=old_sample_submission,
+         public_dir=public,
+         private_dir=private,
+         to_predict=to_predict,
+     )
+
+     # --- New: Create a validation set by splitting the training set further ---
+
+     # Define new directories for the validation set, parallel to the original ones
+     public_val = public.parent / "public_val"
+     private_val = private.parent / "private_val"
+
+     # Second split: Create a new, smaller training set and a validation set.
+     # The logic and size are replicated from the first split for consistency.
+     val_split_base = new_train.copy()
+     val_split_base["test"] = False
+     val_test_indices = (
+         val_split_base[val_split_base["SN_filter"] > 0]
+         .sample(n=n_test_samples, random_state=0)  # Use same size and random state
+         .index
+     )
+     val_split_base.loc[val_test_indices, "test"] = True
+
+     train_val = val_split_base[~val_split_base["test"]].copy().drop(columns=["test"])
+     test_val = val_split_base[val_split_base["test"]].copy().drop(columns=["test"])
+
+     # Generate the validation competition files in the new directories.
+     # The file structure and names inside public_val/private_val will mirror
+     # the original public/private directories.
+     _create_competition_files(
+         train_df=train_val,
+         test_df=test_val,
+         old_test_columns=old_test.columns,
+         old_sample_submission=old_sample_submission,
+         public_dir=public_val,
+         private_dir=private_val,
+         to_predict=to_predict,
+     )
+
+     # Final sanity checks
+     assert len(new_train) + len(new_test) == len(old_train), (
+         f"Expected the length of the new train set plus the length of the new test set to be "
+         f"equal to the length of the old train set, but got {len(new_train) + len(new_test)} "
+         f"instead of {len(old_train)}."
+     )
+     assert len(train_val) + len(test_val) == len(new_train), (
+         f"Expected the length of the validation train set plus the length of the validation test set "
+         f"to be equal to the length of the main train set, but got {len(train_val) + len(test_val)} "
+         f"instead of {len(new_train)}."
+     )
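To make the explosion above concrete, here is a toy, self-contained sketch (all values invented; in the real competition seq_length is 107 or 130 and seq_scored 68 or 91) showing how a single test row expands into one id_seqpos record per position, with the unscored tail zero-filled and flagged keep=False:

import pandas as pd

# Toy illustration of the row explosion in _create_competition_files.
row = {
    "id": "id_001",
    "seq_length": 4,
    "seq_scored": 2,
    "reactivity": [0.5, 1.2],
    "deg_Mg_pH10": [0.1, 0.2],
}
records = []
for j in range(row["seq_length"]):
    scored = j < row["seq_scored"]
    records.append({
        "id_seqpos": f"{row['id']}_{j}",
        "reactivity": row["reactivity"][j] if scored else 0.0,
        "deg_Mg_pH10": row["deg_Mg_pH10"][j] if scored else 0.0,
        "keep": scored,  # only scored positions count toward the metric
    })
print(pd.DataFrame(records))
#   id_seqpos  reactivity  deg_Mg_pH10   keep
# 0  id_001_0         0.5          0.1   True
# 1  id_001_1         1.2          0.2   True
# 2  id_001_2         0.0          0.0  False
# 3  id_001_3         0.0          0.0  False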
mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py
@@ -0,0 +1,41 @@
+ from typing import Tuple
+
+ import numpy as np
+ import pandas as pd
+ from sklearn.metrics import log_loss
+
+ from mlebench.grade_helpers import InvalidSubmissionError
+
+
+ def prepare_for_metric(
+     submission: pd.DataFrame, answers: pd.DataFrame
+ ) -> Tuple[np.ndarray, np.ndarray]:
+     # answers checks
+     assert "id" in answers.columns, "Answers is missing `id` column"
+     assert "is_iceberg" in answers.columns, "Answers is missing `is_iceberg` column"
+
+     # submission checks
+     if "id" not in submission.columns:
+         raise InvalidSubmissionError("Submission is missing `id` column")
+     if "is_iceberg" not in submission.columns:
+         raise InvalidSubmissionError("Submission is missing `is_iceberg` column")
+     if set(submission["id"]) != set(answers["id"]):
+         raise InvalidSubmissionError("Submission and answers have different id's")
+     if not ((submission["is_iceberg"] >= 0) & (submission["is_iceberg"] <= 1)).all():
+         raise InvalidSubmissionError(
+             "All values in submission `is_iceberg` must be between 0 and 1."
+         )
+
+     # sort by id to ensure correct order
+     submission = submission.sort_values("id")
+     answers = answers.sort_values("id")
+
+     y_true = answers["is_iceberg"]
+     y_pred = submission["is_iceberg"]
+     return y_true, y_pred
+
+
+ def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+     y_true, y_pred = prepare_for_metric(submission, answers)
+     score = log_loss(y_true, y_pred)
+     return score
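Once a submission passes validation, the grader above reduces to sklearn's binary log loss. A minimal self-contained sketch with invented values:

import pandas as pd
from sklearn.metrics import log_loss

# Invented toy data in the shape grade() expects.
answers = pd.DataFrame({"id": ["a", "b", "c"], "is_iceberg": [1, 0, 1]})
submission = pd.DataFrame({"id": ["c", "a", "b"], "is_iceberg": [0.8, 0.9, 0.2]})

# grade() sorts both frames by id before scoring, so the row order of the
# submission does not matter; only the id alignment does.
y_true = answers.sort_values("id")["is_iceberg"]
y_pred = submission.sort_values("id")["is_iceberg"]
print(log_loss(y_true, y_pred))  # lower is better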
mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py
@@ -0,0 +1,105 @@
+ import json
+ import shutil
+ from pathlib import Path
+
+ import pandas as pd
+ import py7zr
+ from sklearn.model_selection import train_test_split
+
+ from mlebench.utils import extract
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     """
+     Splits the data in raw into public and private datasets with appropriate test/train splits.
+     """
+     extract(raw / "train.json.7z", raw)
+     extract(raw / "test.json.7z", raw)
+     old_train = json.load((raw / "data/processed/train.json").open())
+     old_test = json.load((raw / "data/processed/test.json").open())
+
+     all_fields = list(old_train[0].keys())
+     assert all(
+         set(all_fields) == set(sample.keys()) for sample in old_train
+     ), "Inconsistent fields in train set"
+     test_fields = list(old_test[0].keys())
+     assert all(
+         set(test_fields) == set(sample.keys()) for sample in old_test
+     ), "Inconsistent fields in test set"
+
+     # The old split was 1,604 train samples vs. 8,424 test samples (~84% of the data in the test set).
+     # We use a 20% test ratio instead, to avoid removing too many samples from train.
+     new_train, new_test = train_test_split(old_train, test_size=0.2, random_state=0)
+     new_test_without_labels = []
+     for sample in new_test:
+         # Keep only the fields that appear in the test set
+         sample = {key: sample[key] for key in test_fields}
+         new_test_without_labels.append(sample)
+
+     # Write new train and test splits, compress, then remove the uncompressed files
+     (private / "tmp_data").mkdir(exist_ok=True)
+     with open(private / "tmp_data" / "train.json", "w") as f:
+         json.dump(new_train, f)
+     with open(private / "tmp_data" / "test.json", "w") as f:
+         json.dump(new_test_without_labels, f)
+
+     with py7zr.SevenZipFile(public / "train.json.7z", "w") as archive:
+         archive.write(
+             private / "tmp_data" / "train.json",
+             arcname=(private / "tmp_data" / "train.json").relative_to(private / "tmp_data"),
+         )
+
+     with py7zr.SevenZipFile(public / "test.json.7z", "w") as archive:
+         archive.write(
+             private / "tmp_data" / "test.json",
+             arcname=(private / "tmp_data" / "test.json").relative_to(private / "tmp_data"),
+         )
+
+     # Make answers as csv from json
+     answer_rows = []
+     for sample in new_test:
+         answer_rows.append(
+             {
+                 "id": sample["id"],
+                 "is_iceberg": int(sample["is_iceberg"]),
+             }
+         )
+     answers = pd.DataFrame(answer_rows)
+     answers.to_csv(private / "test.csv", index=False)
+
+     # Make sample submission
+     sample_submission = answers.copy()
+     sample_submission["is_iceberg"] = 0.5
+     sample_submission.to_csv(private / "sample_submission.csv", index=False)
+     with py7zr.SevenZipFile(public / "sample_submission.csv.7z", "w") as archive:
+         archive.write(
+             private / "sample_submission.csv",
+             arcname=(private / "sample_submission.csv").relative_to(private),
+         )
+
+     # Remove uncompressed files
+     shutil.rmtree(private / "tmp_data")
+
+     # Checks
+     assert len(new_train) + len(new_test) == len(
+         old_train
+     ), f"Expected {len(old_train)} total samples in new_train ({len(new_train)}) and new_test ({len(new_test)})"
+     assert len(new_test) == len(
+         new_test_without_labels
+     ), f"Expected new_test ({len(new_test)}) to have the same length as new_test_without_labels ({len(new_test_without_labels)})"
+     assert len(answers) == len(
+         new_test
+     ), f"Expected answers ({len(answers)}) to have the same length as new_test ({len(new_test)})"
+     assert len(sample_submission) == len(
+         new_test
+     ), f"Expected sample_submission ({len(sample_submission)}) to have the same length as new_test ({len(new_test)})"
+     assert set(answers.columns) == set(
+         ["id", "is_iceberg"]
+     ), "Answers must have 'id' and 'is_iceberg' columns"
+     assert set(sample_submission.columns) == set(
+         ["id", "is_iceberg"]
+     ), "Sample submission must have 'id' and 'is_iceberg' columns"
+
+     new_train_ids = set([sample["id"] for sample in new_train])
+     new_test_ids = set([sample["id"] for sample in new_test])
+     assert new_train_ids.isdisjoint(new_test_ids), "Train and test ids should not overlap"
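The public artifacts above ship as .7z archives. A short sketch of reading one back with py7zr (the same library used for writing), assuming prepare() has already run against a local directory:

import json
from pathlib import Path

import py7zr

public = Path("public")  # hypothetical output directory from prepare()

# Extract the compressed train split; the arcname was "train.json", so it is
# restored relative to the extraction path.
with py7zr.SevenZipFile(public / "train.json.7z", mode="r") as archive:
    archive.extractall(path=public)

train = json.load((public / "train.json").open())
print(len(train), "training samples;", sorted(train[0].keys()))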
mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py
@@ -0,0 +1,157 @@
+ import json
+ import shutil
+ from pathlib import Path
+ from typing import List, Dict
+
+ import pandas as pd
+ import py7zr
+ from sklearn.model_selection import train_test_split
+
+ from mlebench.utils import extract
+
+
+ def _create_split_files(
+     train_set: List[Dict],
+     test_set: List[Dict],
+     test_fields: List[str],
+     public_dir: Path,
+     private_dir: Path,
+ ):
+     """
+     Helper function to generate all files for a given train/test split.
+
+     This function creates:
+     - public/{train.json.7z, test.json.7z, sample_submission.csv.7z}
+     - private/{test.csv, sample_submission.csv}
+     """
+     # Ensure output directories exist
+     public_dir.mkdir(exist_ok=True, parents=True)
+     private_dir.mkdir(exist_ok=True, parents=True)
+
+     # Create test set without labels
+     test_set_without_labels = []
+     for sample in test_set:
+         sample_copy = {key: sample[key] for key in test_fields}
+         test_set_without_labels.append(sample_copy)
+
+     # Write new train and test splits to a temporary directory
+     tmp_data_dir = private_dir / "tmp_data"
+     tmp_data_dir.mkdir(exist_ok=True)
+     try:
+         with open(tmp_data_dir / "train.json", "w") as f:
+             json.dump(train_set, f)
+         with open(tmp_data_dir / "test.json", "w") as f:
+             json.dump(test_set_without_labels, f)
+
+         # Compress the JSON files into the public directory
+         with py7zr.SevenZipFile(public_dir / "train.json.7z", "w") as archive:
+             archive.write(
+                 tmp_data_dir / "train.json",
+                 arcname="train.json",
+             )
+         with py7zr.SevenZipFile(public_dir / "test.json.7z", "w") as archive:
+             archive.write(
+                 tmp_data_dir / "test.json",
+                 arcname="test.json",
+             )
+
+         # Make answers as csv from the labeled test set
+         answer_rows = [
+             {"id": sample["id"], "is_iceberg": int(sample["is_iceberg"])}
+             for sample in test_set
+         ]
+         answers = pd.DataFrame(answer_rows)
+         answers.to_csv(private_dir / "test.csv", index=False)
+
+         # Make a sample submission file
+         sample_submission = answers.copy()
+         sample_submission["is_iceberg"] = 0.5
+         sample_submission.to_csv(private_dir / "sample_submission.csv", index=False)
+         with py7zr.SevenZipFile(public_dir / "sample_submission.csv.7z", "w") as archive:
+             archive.write(
+                 private_dir / "sample_submission.csv",
+                 arcname="sample_submission.csv",
+             )
+     finally:
+         # Ensure temporary files are removed
+         shutil.rmtree(tmp_data_dir)
+
+     # --- Final checks for this split ---
+     assert len(test_set) == len(
+         test_set_without_labels
+     ), f"Expected test_set ({len(test_set)}) to have the same length as test_set_without_labels ({len(test_set_without_labels)})"
+     assert len(answers) == len(
+         test_set
+     ), f"Expected answers ({len(answers)}) to have the same length as test_set ({len(test_set)})"
+     assert len(sample_submission) == len(
+         test_set
+     ), f"Expected sample_submission ({len(sample_submission)}) to have the same length as test_set ({len(test_set)})"
+     assert set(answers.columns) == set(
+         ["id", "is_iceberg"]
+     ), "Answers must have 'id' and 'is_iceberg' columns"
+     assert set(sample_submission.columns) == set(
+         ["id", "is_iceberg"]
+     ), "Sample submission must have 'id' and 'is_iceberg' columns"
+     train_ids = set([sample["id"] for sample in train_set])
+     test_ids = set([sample["id"] for sample in test_set])
+     assert train_ids.isdisjoint(test_ids), "Train and test ids should not overlap"
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     """
+     Splits the data in raw into public and private datasets with appropriate test/train splits.
+     Also creates a secondary validation split (public_val, private_val) for model development.
+     """
+     extract(raw / "train.json.7z", raw)
+     extract(raw / "test.json.7z", raw)
+     old_train = json.load((raw / "data/processed/train.json").open())
+     old_test = json.load((raw / "data/processed/test.json").open())
+
+     all_fields = list(old_train[0].keys())
+     assert all(
+         set(all_fields) == set(sample.keys()) for sample in old_train
+     ), "Inconsistent fields in train set"
+     test_fields = list(old_test[0].keys())
+     assert all(
+         set(test_fields) == set(sample.keys()) for sample in old_test
+     ), "Inconsistent fields in test set"
+
+     # --- First Split: Create the main train/test sets for the competition ---
+     # The old split was 1,604 train samples vs. 8,424 test samples (~84% of the data in the test set).
+     # We use a 20% test ratio instead, to avoid removing too many samples from train.
+     new_train, new_test = train_test_split(old_train, test_size=0.2, random_state=0)
+
+     # Generate the original public and private directory files
+     _create_split_files(
+         train_set=new_train,
+         test_set=new_test,
+         test_fields=test_fields,
+         public_dir=public,
+         private_dir=private,
+     )
+
+     # Check that the total number of samples is conserved in the first split
+     assert len(new_train) + len(new_test) == len(
+         old_train
+     ), f"Expected {len(old_train)} total samples in new_train ({len(new_train)}) and new_test ({len(new_test)})"
+
+     # --- Second Split: Create a validation set from the main training set ---
+     # Define paths for the new validation split
+     public_val = public.parent / "public_val"
+     private_val = private.parent / "private_val"
+
+     # The goal is for the new validation set (test_val) to have the same size as the original test set (new_test).
+     # test_size = len(new_test) / len(new_train) = (0.2 * N) / (0.8 * N) = 0.25
+     test_size_for_val_split = len(new_test) / len(new_train)
+     train_val, test_val = train_test_split(
+         new_train, test_size=test_size_for_val_split, random_state=0
+     )
+
+     # Generate the validation public_val and private_val directory files
+     _create_split_files(
+         train_set=train_val,
+         test_set=test_val,
+         test_fields=test_fields,
+         public_dir=public_val,
+         private_dir=private_val,
+     )
mlebench/competitions/tabular-playground-series-dec-2021/grade.py
@@ -0,0 +1,11 @@
+ import pandas as pd
+ from sklearn.metrics import accuracy_score
+
+ from mlebench.competitions.utils import prepare_for_accuracy_metric
+
+
+ def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+     accuracy_inputs = prepare_for_accuracy_metric(
+         submission=submission, answers=answers, target_column="Cover_Type", id_column="Id"
+     )
+     return accuracy_score(**accuracy_inputs)
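prepare_for_accuracy_metric comes from mlebench/competitions/utils.py (added above); its exact behavior is not shown in this diff, but the grader's effect is plain accuracy over Id-aligned predictions. A self-contained sketch, with invented data, of what that computation presumably amounts to:

import pandas as pd
from sklearn.metrics import accuracy_score

# Invented toy data shaped like a submission/answers pair for this grader.
answers = pd.DataFrame({"Id": [1, 2, 3], "Cover_Type": [2, 1, 7]})
submission = pd.DataFrame({"Id": [3, 1, 2], "Cover_Type": [7, 2, 2]})

# Align predictions to ground truth on the Id column before scoring.
merged = submission.merge(answers, on="Id", suffixes=("_pred", "_true"))
print(accuracy_score(merged["Cover_Type_true"], merged["Cover_Type_pred"]))  # 2/3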
mlebench/competitions/tabular-playground-series-dec-2021/prepare.py
@@ -0,0 +1,39 @@
+ from pathlib import Path
+
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+
+ from mlebench.utils import read_csv
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     """
+     Splits the data in raw into public and private datasets with appropriate test/train splits.
+     """
+     # Create train, test from train split
+     old_train = read_csv(raw / "train.csv")
+     new_train, new_test = train_test_split(old_train, test_size=0.1, random_state=0)
+     new_test_without_labels = new_test.drop(columns=["Cover_Type"])
+
+     new_train.to_csv(public / "train.csv", index=False)
+     new_test.to_csv(private / "test.csv", index=False)
+     new_test_without_labels.to_csv(public / "test.csv", index=False)
+
+     # Create a sample submission file
+     submission_df = pd.DataFrame(
+         {
+             "Id": new_test["Id"].values,
+             "Cover_Type": 2,
+         }
+     )
+     submission_df.to_csv(public / "sample_submission.csv", index=False)
+
+     assert len(new_train) + len(new_test) == len(
+         old_train
+     ), "train and test splits lengths do not add up to original data."
+     assert set(new_train["Id"]).isdisjoint(
+         set(new_test["Id"])
+     ), f"there are Ids in both train and test sets: {set(new_train['Id']).intersection(set(new_test['Id']))}"
+     assert len(new_test_without_labels) == len(
+         new_test
+     ), "public and private test sets should have equal length"
mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py
@@ -0,0 +1,99 @@
+ from pathlib import Path
+
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+
+ from mlebench.utils import read_csv
+
+
+ def _split_and_save(
+     df: pd.DataFrame,
+     test_size: float,
+     public_path: Path,
+     private_path: Path,
+     random_state: int,
+ ):
+     """
+     Helper function to perform a data split, save files, and run assertions.
+
+     Args:
+         df (pd.DataFrame): The dataframe to split.
+         test_size (float): The proportion of the dataset to allocate to the test split.
+         public_path (Path): The directory for public-facing files (train set, unlabeled test set).
+         private_path (Path): The directory for private-facing files (labeled test set).
+         random_state (int): The seed for the random number generator.
+
+     Returns:
+         Tuple[pd.DataFrame, pd.DataFrame]: The resulting train and test dataframes.
+     """
+     # Ensure output directories exist
+     public_path.mkdir(parents=True, exist_ok=True)
+     private_path.mkdir(parents=True, exist_ok=True)
+
+     # Perform the split
+     train_df, test_df = train_test_split(
+         df, test_size=test_size, random_state=random_state
+     )
+     test_df_without_labels = test_df.drop(columns=["Cover_Type"])
+
+     # Save the split data using standard filenames
+     train_df.to_csv(public_path / "train.csv", index=False)
+     test_df.to_csv(private_path / "test.csv", index=False)
+     test_df_without_labels.to_csv(public_path / "test.csv", index=False)
+
+     # Create a sample submission file
+     submission_df = pd.DataFrame(
+         {"Id": test_df["Id"].values, "Cover_Type": 2}
+     )
+     submission_df.to_csv(public_path / "sample_submission.csv", index=False)
+
+     # Assertions
+     assert len(train_df) + len(test_df) == len(
+         df
+     ), "train and test splits lengths do not add up to original data."
+     assert set(train_df["Id"]).isdisjoint(
+         set(test_df["Id"])
+     ), f"there are Ids in both train and test sets: {set(train_df['Id']).intersection(set(test_df['Id']))}"
+     assert len(test_df_without_labels) == len(
+         test_df
+     ), "public and private test sets should have equal length"
+
+     return train_df, test_df
+
+
+ def prepare(raw: Path, public: Path, private: Path):
+     """
+     Splits the data in raw into public and private datasets with appropriate test/train splits.
+     Also creates a secondary validation split in public_val/private_val directories.
+     """
+     # Create train, test from train split
+     old_train = read_csv(raw / "train.csv")
+
+     # --- First Split: Create the original train and test sets ---
+     # This call produces the original outputs, which must remain unchanged.
+     new_train, new_test = _split_and_save(
+         df=old_train,
+         test_size=0.1,
+         public_path=public,
+         private_path=private,
+         random_state=0,
+     )
+
+     # --- Second Split: Create a new train and validation set ---
+     # Define paths for the new validation set outputs
+     public_val = public.parent / "public_val"
+     private_val = private.parent / "private_val"
+
+     # Calculate the test size for the second split to make the new validation
+     # set (`test_val`) have the same number of samples as the original test set.
+     test_val_size = len(new_test) / len(new_train)
+
+     # This call takes the `new_train` data and splits it again, saving the
+     # results to the new `_val` directories with identical structure and filenames.
+     _split_and_save(
+         df=new_train,
+         test_size=test_val_size,
+         public_path=public_val,
+         private_path=private_val,
+         random_state=0,  # Use the same random state for consistency
+     )
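The test_val_size arithmetic mirrors the statoil script above: after holding out 10% once, the remaining train set is 90% of the original, so a second split with ratio len(new_test)/len(new_train) = 0.1/0.9 ≈ 0.111 yields a validation set the same size as the first test set (up to rounding). A quick worked example with an invented row count:

# Worked example of the second-split sizing (N is invented).
N = 10_000
new_test = int(N * 0.1)                       # 1,000 rows held out first
new_train = N - new_test                      # 9,000 rows remain
test_val_size = new_test / new_train          # 0.111...
test_val = round(new_train * test_val_size)   # ~1,000 rows, matching new_test
print(new_test, new_train, test_val)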
mlebench/competitions/tabular-playground-series-may-2022/grade.py
@@ -0,0 +1,9 @@
+ import pandas as pd
+ from sklearn.metrics import roc_auc_score
+
+ from mlebench.competitions.utils import prepare_for_auroc_metric
+
+
+ def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
+     roc_auc_inputs = prepare_for_auroc_metric(submission, answers, id_col="id", target_col="target")
+     return roc_auc_score(y_true=roc_auc_inputs["y_true"], y_score=roc_auc_inputs["y_score"])
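As with the other graders, this one ultimately defers to sklearn; prepare_for_auroc_metric (defined in mlebench/competitions/utils.py, not shown here) presumably aligns the frames on id before returning y_true/y_score. A self-contained sketch with invented probabilities of the equivalent computation:

import pandas as pd
from sklearn.metrics import roc_auc_score

# Invented toy data shaped like a submission/answers pair for this grader.
answers = pd.DataFrame({"id": [1, 2, 3, 4], "target": [0, 0, 1, 1]})
submission = pd.DataFrame({"id": [1, 2, 3, 4], "target": [0.1, 0.4, 0.35, 0.8]})

# Align scores to labels on the id column, then compute AUROC.
merged = submission.merge(answers, on="id", suffixes=("_score", "_true"))
print(roc_auc_score(y_true=merged["target_true"], y_score=merged["target_score"]))  # 0.75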