dslighting 1.7.1__py3-none-any.whl → 1.7.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352) hide show
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/METADATA +3 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,88 @@
1
+ import random
2
+ import shutil
3
+ from pathlib import Path
4
+
5
+ import pandas as pd
6
+ from sklearn.model_selection import train_test_split
7
+ from tqdm import tqdm
8
+
9
+
10
def prepare(raw: Path, public: Path, private: Path, test_size: int = 5000):
    """
    Split the raw steganalysis data into public and private datasets.

    Copies the train split (cover images plus their three steganographically
    modified counterparts) into ``public``, builds a mixed positive/negative
    test set under ``public/Test``, writes the ground-truth labels to
    ``private/test.csv``, and emits an all-zero ``public/sample_submission.csv``.

    Args:
        raw: Directory holding the original data; assumes ``Cover``,
            ``JMiPOD``, ``JUNIWARD`` and ``UERD`` subdirectories containing
            identically named ``.jpg`` files (TODO confirm against dataset).
        public: Output directory for the competitor-visible files.
        private: Output directory for the grading answer key.
        test_size: Number of cover images held out for the test set
            (default 5000, matching the original hard-coded value).
    """
    # List of all train image IDs
    cover_images = sorted((raw / "Cover").glob("*.jpg"))

    # Create train, test from train split (fixed seed for reproducibility)
    new_train, new_test = train_test_split(cover_images, test_size=test_size, random_state=42)

    # Prepare dirs; private must exist before the answer key is written below
    steganography_algs = ["JMiPOD", "JUNIWARD", "UERD"]
    private.mkdir(parents=True, exist_ok=True)
    for subdir in ["Cover", "Test"] + steganography_algs:
        (public / subdir).mkdir(parents=True, exist_ok=True)

    # Copy over new train set, giving them new ids
    for idx, fp in tqdm(enumerate(new_train), total=len(new_train), desc="Copying train images"):
        image_id = idx + 1  # 1-indexed
        shutil.copyfile(src=fp, dst=public / "Cover" / f"{image_id:05d}.jpg")
        for alg in steganography_algs:
            # Modified versions share the cover image's original filename
            shutil.copyfile(src=raw / alg / fp.name, dst=public / alg / f"{image_id:05d}.jpg")

    # Populate test set
    answers_rows = []
    random.seed(0)  # deterministic test-set composition
    random.shuffle(new_test)
    for idx, fp in tqdm(enumerate(new_test), total=len(new_test), desc="Copying test images"):
        image_id = idx + 1  # 1-indexed
        test_id = f"{image_id:04d}.jpg"
        dest = public / "Test" / test_id

        # For the test set, we randomly select between the "Cover" (unedited image, negative class)
        # and one of the 3 steganography algorithms (positive class)
        # 1:1 ratio of positive:negative examples, and even distribution of steganography algorithms
        if random.choice([True, False]):
            # Negative class
            shutil.copyfile(src=fp, dst=dest)
            answers_rows.append({"Id": test_id, "Label": 0})
        else:
            # Positive class
            alg = random.choice(steganography_algs)
            shutil.copyfile(src=raw / alg / fp.name, dst=dest)
            answers_rows.append({"Id": test_id, "Label": 1})

    # Write answers to file
    answers_df = pd.DataFrame(answers_rows)
    answers_df.to_csv(private / "test.csv", index=False)

    # Create sample submission: same ids, all labels zeroed out
    sample_submission = answers_df.copy()
    sample_submission["Label"] = 0
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    # Sanity checks on the generated split
    assert "Id" in answers_df.columns, "Answers must have 'Id' column"
    assert "Label" in answers_df.columns, "Answers must have 'Label' column"
    assert "Id" in sample_submission.columns, "Sample submission must have 'Id' column"
    assert "Label" in sample_submission.columns, "Sample submission must have 'Label' column"
    assert (
        len(answers_df) == test_size
    ), f"Expected {test_size} test images, but got {len(answers_df)}"
    assert len(sample_submission) == len(
        answers_df
    ), f"Sample submission ({len(sample_submission)}) and answers ({len(answers_df)}) must have the same length"
    assert (
        len(list(public.glob("Test/*.jpg"))) == test_size
    ), f"Expected {test_size} test images in public/Test, but got {len(list(public.glob('Test/*.jpg')))}"
    assert len(list(public.glob("Cover/*.jpg"))) == len(
        new_train
    ), f"Expected {len(new_train)} train images in public/Cover, but got {len(list(public.glob('Cover/*.jpg')))}"
    for alg in steganography_algs:
        assert len(list(public.glob(f"{alg}/*.jpg"))) == len(
            new_train
        ), f"Expected {len(new_train)} train images in public/{alg}, but got {len(list(public.glob(f'{alg}/*.jpg')))}"
@@ -0,0 +1,148 @@
1
+ import random
2
+ import shutil
3
+ from pathlib import Path
4
+
5
+ import pandas as pd
6
+ from sklearn.model_selection import train_test_split
7
+ from tqdm import tqdm
8
+
9
+
10
def _create_dataset_split(
    train_image_paths: list,
    test_image_paths: list,
    raw_dir: Path,
    public_dir: Path,
    private_dir: Path,
    steganography_algs: list,
) -> None:
    """
    Helper function to create a dataset split (e.g., train/test or train_val/test_val).

    This function populates the public and private directories with the respective
    training images, test images, and test set ground-truth labels.

    Args:
        train_image_paths: Cover-image paths copied (renamed to 1-indexed ids)
            into ``public_dir/Cover`` and each algorithm subdirectory.
        test_image_paths: Cover-image paths used to build the test set.
            NOTE(review): this list is shuffled IN PLACE via ``random.shuffle``,
            so the caller's list order changes after this call.
        raw_dir: Raw data root; must contain one subdirectory per steganography
            algorithm whose files are named identically to the cover images.
        public_dir: Destination for participant-visible images and the sample
            submission. Created if missing.
        private_dir: Destination for the ground-truth ``test.csv``. Created if missing.
        steganography_algs: Algorithm subdirectory names (positive-class sources).
    """
    # Prepare dirs
    public_dir.mkdir(parents=True, exist_ok=True)
    private_dir.mkdir(parents=True, exist_ok=True)
    for subdir in ["Cover", "Test"] + steganography_algs:
        (public_dir / subdir).mkdir(exist_ok=True)

    # Copy over the train set for this split, giving them new ids
    for idx, fp in tqdm(
        enumerate(train_image_paths), total=len(train_image_paths), desc=f"Copying train images to {public_dir.name}"
    ):
        image_id = idx + 1  # 1-indexed
        shutil.copyfile(src=fp, dst=public_dir / "Cover" / f"{image_id:05d}.jpg")
        # The stego variants share the original file name, so fp.name keys them.
        for alg in steganography_algs:
            shutil.copyfile(src=raw_dir / alg / fp.name, dst=public_dir / alg / f"{image_id:05d}.jpg")

    # Populate the test set for this split
    answers_rows = []
    random.seed(0)  # Reset seed for deterministic test set creation
    random.shuffle(test_image_paths)
    for idx, fp in tqdm(
        enumerate(test_image_paths), total=len(test_image_paths), desc=f"Copying test images to {public_dir.name}"
    ):
        image_id = idx + 1  # 1-indexed
        # NOTE: test ids use 4-digit zero padding, train ids use 5-digit.
        test_id = f"{image_id:04d}.jpg"
        dest = public_dir / "Test" / test_id

        # For the test set, we randomly select between the "Cover" (unedited image, negative class)
        # and one of the 3 steganography algorithms (positive class)
        # 1:1 ratio of positive:negative examples, and even distribution of steganography algorithms
        if random.choice([True, False]):
            # Negative class
            shutil.copyfile(
                src=fp,
                dst=dest,
            )
            answers_rows.append({"Id": test_id, "Label": 0})
        else:
            # Positive class
            alg = random.choice(steganography_algs)
            shutil.copyfile(src=raw_dir / alg / fp.name, dst=dest)
            answers_rows.append({"Id": test_id, "Label": 1})

    # Write answers to file
    answers_df = pd.DataFrame(answers_rows)
    answers_df.to_csv(private_dir / "test.csv", index=False)

    # Create sample submission (same Ids as the answers, all predictions zeroed)
    sample_submission = answers_df.copy()
    sample_submission["Label"] = 0
    sample_submission.to_csv(public_dir / "sample_submission.csv", index=False)

    # Checks
    test_size = len(test_image_paths)
    assert "Id" in answers_df.columns, "Answers must have 'Id' column"
    assert "Label" in answers_df.columns, "Answers must have 'Label' column"
    assert "Id" in sample_submission.columns, "Sample submission must have 'Id' column"
    assert "Label" in sample_submission.columns, "Sample submission must have 'Label' column"
    assert (
        len(answers_df) == test_size
    ), f"Expected {test_size} test images, but got {len(answers_df)}"
    assert len(sample_submission) == len(
        answers_df
    ), f"Sample submission ({len(sample_submission)}) and answers ({len(answers_df)}) must have the same length"
    assert (
        len(list(public_dir.glob("Test/*.jpg"))) == test_size
    ), f"Expected {test_size} test images in {public_dir.name}/Test, but got {len(list(public_dir.glob('Test/*.jpg')))}"
    assert len(list(public_dir.glob("Cover/*.jpg"))) == len(
        train_image_paths
    ), f"Expected {len(train_image_paths)} train images in {public_dir.name}/Cover, but got {len(list(public_dir.glob('Cover/*.jpg')))}"
    for alg in steganography_algs:
        assert len(list(public_dir.glob(f"{alg}/*.jpg"))) == len(
            train_image_paths
        ), f"Expected {len(train_image_paths)} train images in {public_dir.name}/{alg}, but got {len(list(public_dir.glob(f'{alg}/*.jpg')))}"
97
+
98
+
99
def prepare(raw: Path, public: Path, private: Path):
    """
    Build the main public/private competition split from the raw ALASKA2 data,
    then carve a parallel validation split into sibling public_val/private_val
    directories using the same construction logic.
    """
    algs = ["JMiPOD", "JUNIWARD", "UERD"]
    holdout_size = 5000
    # All raw cover (unedited) images, in a deterministic order.
    all_covers = sorted((raw / "Cover").glob("*.jpg"))

    # Stage 1: the main competition split. The contents of `public` and
    # `private` produced here are the ones used for final scoring.
    main_train, main_test = train_test_split(
        all_covers, test_size=holdout_size, random_state=42
    )
    _create_dataset_split(
        train_image_paths=main_train,
        test_image_paths=main_test,
        raw_dir=raw,
        public_dir=public,
        private_dir=private,
        steganography_algs=algs,
    )

    # Stage 2: split the stage-1 training pool a second time, producing a
    # validation set of the same size as the main test set. Outputs go to
    # sibling *_val directories so the main split stays untouched.
    val_public = public.parent / "public_val"
    val_private = private.parent / "private_val"
    val_train, val_test = train_test_split(
        main_train, test_size=holdout_size, random_state=42
    )
    # Reusing the helper guarantees identical directory structure, file
    # naming, and test-set construction logic for the validation split.
    _create_dataset_split(
        train_image_paths=val_train,
        test_image_paths=val_test,
        raw_dir=raw,
        public_dir=val_public,
        private_dir=val_private,
        steganography_algs=algs,
    )
@@ -0,0 +1,35 @@
1
+ from pandas import DataFrame
2
+ from sklearn.metrics import cohen_kappa_score
3
+
4
+ from mlebench.grade_helpers import InvalidSubmissionError
5
+
6
+
7
def prepare_for_grading(submission: DataFrame, answers: DataFrame, target_column: str) -> tuple:
    """
    Validate a submission against the answers and return aligned label arrays.

    Args:
        submission: Participant predictions; must contain ``id_code`` and
            ``target_column``.
        answers: Ground-truth labels; must contain ``id_code`` and
            ``target_column``.
        target_column: Name of the label column to extract from both frames.

    Returns:
        Tuple ``(y_pred, y_true)`` of label arrays aligned by sorted ``id_code``.

    Raises:
        InvalidSubmissionError: If the submission is missing required columns,
            is empty, has the wrong number of rows, or has mismatched ids.
    """
    if target_column not in submission.columns:
        raise InvalidSubmissionError(f"Submission must contain the target column '{target_column}'")
    assert (
        target_column in answers.columns
    ), f"Answers must contain the target column '{target_column}'"
    if submission.empty:
        raise InvalidSubmissionError("Submission DataFrame should not be empty")
    assert not answers.empty, "Answers DataFrame should not be empty"
    # Fail with a clear error instead of a bare KeyError when the id column
    # is missing.
    if "id_code" not in submission.columns:
        raise InvalidSubmissionError("Submission must contain an 'id_code' column")
    # A set comparison alone cannot catch duplicated ids: equal sets with
    # different row counts would later crash in the metric with misaligned
    # arrays. Check lengths explicitly first.
    if len(submission) != len(answers):
        raise InvalidSubmissionError(
            f"Submission has {len(submission)} rows but answers has {len(answers)} rows"
        )
    if set(submission["id_code"]) != set(answers["id_code"]):
        raise InvalidSubmissionError("Submission must have the same id_codes as answers")

    # Ensure the submissions and answers are sorted by id_code
    submission_sorted = submission.sort_values(by="id_code")
    answers_sorted = answers.sort_values(by="id_code")

    # Extract the target columns
    y_pred = submission_sorted[target_column].values
    y_true = answers_sorted[target_column].values

    return y_pred, y_true
29
+
30
+
31
def grade(submission: DataFrame, answers: DataFrame) -> float:
    """Score a submission using quadratic-weighted Cohen's kappa."""
    predictions, ground_truth = prepare_for_grading(
        submission=submission, answers=answers, target_column="diagnosis"
    )
    return cohen_kappa_score(predictions, ground_truth, weights="quadratic")
@@ -0,0 +1,75 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ from sklearn.model_selection import train_test_split
5
+
6
+ from mlebench.utils import read_csv
7
+
8
+
9
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.

    Reads ``raw/train.csv``, carves off 10% as the hidden test set, copies the
    corresponding images into ``public/train_images`` / ``public/test_images``,
    and writes the train/test CSVs plus a zeroed sample submission.

    Args:
        raw: Source directory containing ``train.csv`` and ``train_images/``.
        public: Destination for participant-visible files.
        private: Destination for the labeled ``test.csv`` used in grading.
    """

    # Create train, test from train split
    old_train = read_csv(raw / "train.csv")
    new_train, new_test = train_test_split(old_train, test_size=0.1, random_state=0)
    new_test_without_labels = new_test.drop(columns=["diagnosis"])

    # Ensure the output roots exist before creating their subdirectories:
    # mkdir(exist_ok=True) alone fails if `public` itself is missing. This
    # matches the guarded directory creation used by the *_val variant.
    public.mkdir(parents=True, exist_ok=True)
    private.mkdir(parents=True, exist_ok=True)
    (public / "test_images").mkdir(exist_ok=True)
    (public / "train_images").mkdir(exist_ok=True)

    # Copy data
    for file_id in new_train["id_code"]:
        shutil.copyfile(
            src=raw / "train_images" / f"{file_id}.png",
            dst=public / "train_images" / f"{file_id}.png",
        )

    for file_id in new_test_without_labels["id_code"]:
        shutil.copyfile(
            src=raw / "train_images" / f"{file_id}.png",
            dst=public / "test_images" / f"{file_id}.png",
        )

    # Check integrity of the files copied
    assert set(new_train["id_code"]).isdisjoint(
        set(new_test["id_code"])
    ), "Train and test sets should have no shared ids"

    assert len(new_test_without_labels) == len(
        new_test
    ), "Public and Private tests should have equal length"

    assert len(list(public.glob("train_images/*.png"))) == len(
        new_train
    ), "Public train images should have the same number of images as the length of train set"

    assert len(list(public.glob("test_images/*.png"))) == len(
        new_test_without_labels
    ), "Public test images should have the same number of images as the length of test set"

    train_image_files = set(public.glob("train_images/*.png"))
    test_image_files = set(public.glob("test_images/*.png"))
    common_files = train_image_files.intersection(test_image_files)
    assert not common_files, f"Images found in both train_images and test_images: {common_files}"

    for file_id in new_test["id_code"]:
        assert (
            public / "test_images" / f"{file_id}.png"
        ).exists(), f"Image file for {file_id} not found in test_images"

    for file_id in new_train["id_code"]:
        assert (
            public / "train_images" / f"{file_id}.png"
        ).exists(), f"Image file for {file_id} not found in train_images"

    # Create a sample submission file (same rows as test, predictions zeroed)
    submission_df = new_test.copy()
    submission_df["diagnosis"] = 0

    # Write CSVs
    new_train.to_csv(public / "train.csv", index=False)
    new_test.to_csv(private / "test.csv", index=False)
    new_test_without_labels.to_csv(public / "test.csv", index=False)
    submission_df.to_csv(public / "sample_submission.csv", index=False)
@@ -0,0 +1,123 @@
1
+ import shutil
2
+ from pathlib import Path
3
+ import pandas as pd
4
+
5
+ from sklearn.model_selection import train_test_split
6
+
7
+ from mlebench.utils import read_csv
8
+
9
+
10
def _process_and_save_split(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    source_images_path: Path,
    public_path: Path,
    private_path: Path,
) -> None:
    """
    A helper function to process a single split. It handles directory creation,
    file copying, integrity checks, and writing output CSVs.

    Args:
        train_df: Labeled rows (must include ``id_code`` and ``diagnosis``)
            forming this split's training set.
        test_df: Labeled rows forming this split's test set; the public copy
            is written without the ``diagnosis`` column.
        source_images_path: Directory containing ``{id_code}.png`` images.
        public_path: Output directory for participant-visible files.
        private_path: Output directory for the ground-truth ``test.csv``.
    """
    # Create output directories
    public_path.mkdir(exist_ok=True, parents=True)
    private_path.mkdir(exist_ok=True, parents=True)
    (public_path / "test_images").mkdir(exist_ok=True)
    (public_path / "train_images").mkdir(exist_ok=True)

    test_df_without_labels = test_df.drop(columns=["diagnosis"])

    # Copy data for the current split
    for file_id in train_df["id_code"]:
        shutil.copyfile(
            src=source_images_path / f"{file_id}.png",
            dst=public_path / "train_images" / f"{file_id}.png",
        )

    for file_id in test_df_without_labels["id_code"]:
        shutil.copyfile(
            src=source_images_path / f"{file_id}.png",
            dst=public_path / "test_images" / f"{file_id}.png",
        )

    # Check integrity of the files copied for the current split
    assert set(train_df["id_code"]).isdisjoint(
        set(test_df["id_code"])
    ), "Train and test sets should have no shared ids"

    assert len(test_df_without_labels) == len(
        test_df
    ), "Public and Private tests should have equal length"

    assert len(list(public_path.glob("train_images/*.png"))) == len(
        train_df
    ), "Public train images should have the same number of images as the length of train set"

    assert len(list(public_path.glob("test_images/*.png"))) == len(
        test_df_without_labels
    ), "Public test images should have the same number of images as the length of test set"

    train_image_files = set(public_path.glob("train_images/*.png"))
    test_image_files = set(public_path.glob("test_images/*.png"))
    common_files = train_image_files.intersection(test_image_files)
    assert not common_files, f"Images found in both train_images and test_images: {common_files}"

    for file_id in test_df["id_code"]:
        assert (
            public_path / "test_images" / f"{file_id}.png"
        ).exists(), f"Image file for {file_id} not found in test_images"

    for file_id in train_df["id_code"]:
        assert (
            public_path / "train_images" / f"{file_id}.png"
        ).exists(), f"Image file for {file_id} not found in train_images"

    # Create a sample submission file (same rows as test_df, predictions zeroed)
    submission_df = test_df.copy()
    submission_df["diagnosis"] = 0

    # Write CSVs for the current split
    train_df.to_csv(public_path / "train.csv", index=False)
    test_df.to_csv(private_path / "test.csv", index=False)
    test_df_without_labels.to_csv(public_path / "test.csv", index=False)
    submission_df.to_csv(public_path / "sample_submission.csv", index=False)
83
+
84
+
85
def prepare(raw: Path, public: Path, private: Path):
    """
    Build the main public/private split from the raw APTOS data, then create a
    secondary validation split in sibling public_val/private_val directories.
    """
    images_dir = raw / "train_images"

    # Stage 1: the original 90/10 split. The contents of `public` and
    # `private` are exactly what the single-split script produced.
    full_train = read_csv(raw / "train.csv")
    base_train, base_test = train_test_split(full_train, test_size=0.1, random_state=0)
    _process_and_save_split(
        train_df=base_train,
        test_df=base_test,
        source_images_path=images_dir,
        public_path=public,
        private_path=private,
    )

    # Stage 2: split the stage-1 training pool again. Taking 1/9 of
    # `base_train` equals 10% of the full data, so the validation set
    # (`val_test`) matches the main test set in size. Outputs go to the
    # sibling *_val directories to avoid clobbering the main split.
    val_train, val_test = train_test_split(base_train, test_size=1 / 9, random_state=0)
    # The same helper guarantees identical directory structure, filenames,
    # and creation logic for the validation split.
    _process_and_save_split(
        train_df=val_train,
        test_df=val_test,
        source_images_path=images_dir,
        public_path=public.parent / "public_val",
        private_path=private.parent / "private_val",
    )
File without changes
@@ -0,0 +1,55 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.metrics import mean_squared_log_error
4
+
5
+ from mlebench.grade_helpers import InvalidSubmissionError
6
+
7
+
8
def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame):
    """Validate a submission and return aligned (y_true, y_pred) for RMSLE."""
    # Row counts must agree before any alignment is attempted.
    if len(submission) != len(answers):
        raise InvalidSubmissionError(
            f"Submission length ({len(submission)}) != answers length ({len(answers)})"
        )

    # Both required columns must be present in the submission.
    for required in ("datetime", "count"):
        if required not in submission.columns:
            raise InvalidSubmissionError(f"Submission must have a '{required}' column")

    # Align the two frames by datetime order.
    sub_sorted = submission.sort_values("datetime").reset_index(drop=True)
    ans_sorted = answers.sort_values("datetime").reset_index(drop=True)

    # After sorting, the datetime columns must match row-for-row.
    if (sub_sorted["datetime"] != ans_sorted["datetime"]).any():
        raise InvalidSubmissionError("Submission and answers datetime mismatch")

    # Extract predictions and ground truth.
    predictions = sub_sorted["count"].values
    actuals = ans_sorted["count"].values

    # Predictions must be numeric, finite, and non-negative.
    try:
        predictions = predictions.astype(float)
    except (ValueError, TypeError):
        raise InvalidSubmissionError("Predictions must be numeric")

    if np.isnan(predictions).any():
        raise InvalidSubmissionError("Predictions cannot contain NaN values")

    if (predictions < 0).any():
        raise InvalidSubmissionError("Predictions cannot be negative")

    return actuals, predictions
46
+
47
+
48
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Compute the RMSLE score for a submission against the ground truth."""
    actual, predicted = prepare_for_metric(submission, answers)
    # RMSLE is the square root of sklearn's mean squared log error.
    return np.sqrt(mean_squared_log_error(actual, predicted))
@@ -0,0 +1,37 @@
1
+ from pathlib import Path
2
+ import pandas as pd
3
+
4
+
5
def prepare(raw: Path, public: Path, private: Path):
    """
    Prepare bike-sharing-demand dataset.

    Raw data already contains:
    - train.csv: training data with labels
    - test.csv: test data without labels
    - sampleSubmission.csv: sample submission format
    - test_answer.csv: test labels (private)

    Copies the public files into ``public`` and the grading answers into
    ``private``. Validation runs BEFORE any file is written so a malformed
    raw directory cannot leave partially written splits behind.
    """
    # Read data
    train = pd.read_csv(raw / "train.csv")
    test = pd.read_csv(raw / "test.csv")
    sample_submission = pd.read_csv(raw / "sampleSubmission.csv")
    test_answer = pd.read_csv(raw / "test_answer.csv")

    # Validation checks — performed up front, before writing any output.
    assert len(test_answer) == len(test), \
        f"Test answer ({len(test_answer)}) and test ({len(test)}) must have same length"
    assert len(sample_submission) == len(test), \
        f"Sample submission ({len(sample_submission)}) and test ({len(test)}) must have same length"
    assert "datetime" in test_answer.columns, "Test answer must have 'datetime' column"
    assert "count" in test_answer.columns, "Test answer must have 'count' column"
    assert "datetime" in sample_submission.columns, "Sample submission must have 'datetime' column"
    assert "count" in sample_submission.columns, "Sample submission must have 'count' column"

    # Public files (visible to agents)
    train.to_csv(public / "train.csv", index=False)
    test.to_csv(public / "test.csv", index=False)
    sample_submission.to_csv(public / "sampleSubmission.csv", index=False)

    # Private files (for grading)
    test_answer.to_csv(private / "test.csv", index=False)
@@ -0,0 +1,37 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from Levenshtein import distance
4
+
5
+ from mlebench.grade_helpers import InvalidSubmissionError
6
+
7
+
8
def prepare_for_metric(
    submission: pd.DataFrame, answers: pd.DataFrame
) -> "tuple[pd.Series, pd.Series]":
    """
    Validate a submission and align it with the answers by ``id``.

    Returns:
        ``(y_pred, y_true)``: the submission's and answers' ``sentence``
        columns, both sorted by ascending ``id``.

    Raises:
        InvalidSubmissionError: If the submission is missing columns, has a
            different length than the answers, or has mismatched ids.
    """
    # NOTE: the original annotation said `-> None`, but the function returns
    # a tuple of Series; the annotation is corrected here.
    id_col = "id"
    target_col = "sentence"

    assert id_col in answers.columns, f"Missing {id_col} column in answers"
    assert target_col in answers.columns, f"Missing {target_col} column in answers"

    if id_col not in submission.columns:
        raise InvalidSubmissionError(f"Missing {id_col} column in submission")
    if target_col not in submission.columns:
        raise InvalidSubmissionError(f"Missing {target_col} column in submission")

    if len(submission) != len(answers):
        raise InvalidSubmissionError(
            f"Submission and answers have different lengths: {len(submission)} vs {len(answers)}"
        )

    submission = submission.sort_values(id_col).reset_index(drop=True)
    answers = answers.sort_values(id_col).reset_index(drop=True)

    if (submission[id_col].values != answers[id_col].values).any():
        raise InvalidSubmissionError("Submission and answers should contain the same IDs")

    return submission[target_col], answers[target_col]
32
+
33
+
34
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Mean Levenshtein edit distance between true and predicted sentences."""
    predictions, references = prepare_for_metric(submission, answers)
    per_row = [distance(ref, pred) for ref, pred in zip(references, predictions)]
    return np.mean(per_row)