dslighting 1.7.1__py3-none-any.whl → 1.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352) hide show
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/METADATA +1 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.6.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,107 @@
1
+ import os
2
+ import shutil
3
+ import zipfile
4
+ from pathlib import Path
5
+
6
+ import numpy as np
7
+ from tqdm.auto import tqdm
8
+
9
+ from mlebench.utils import extract, get_logger
10
+
11
+ np_rng = np.random.RandomState(0)
12
+
13
+ logger = get_logger(__name__)
14
+
15
+
16
def count_lines_in_file(file_path):
    """Return the number of lines in the text file at ``file_path``.

    The file is iterated line by line rather than read in one go, so the
    whole content is never held in memory at once.  It is decoded explicitly
    as UTF-8 so the result does not depend on the platform's default locale
    encoding.

    Args:
        file_path: Path (``str`` or ``Path``) of the file to count.

    Returns:
        The number of lines yielded when iterating over the file.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        return sum(1 for _line in file)
22
+
23
+
24
def compress_file_to_zip(src_file: Path, zip_file: Path):
    """Pack ``src_file`` into a DEFLATE-compressed zip archive at ``zip_file``.

    The archive is opened in write mode and receives a single member, stored
    under the source file's base name (no directory components).
    """
    member_name = src_file.name
    with zipfile.ZipFile(
        zip_file, mode="w", compression=zipfile.ZIP_DEFLATED
    ) as archive:
        archive.write(src_file, arcname=member_name)
27
+
28
+
29
def remove_random_word(sentence: str) -> str:
    """
    Remove a random 'word' (sequence of characters, delimited by whitespace) from a sentence.
    Does not remove first or last words.

    Punctuation counts as a word, and is already separated by whitespace.

    The remaining words are re-joined with single spaces.  The index is drawn
    from the module-level ``np_rng`` generator.

    Raises:
        ValueError: if the sentence has fewer than three words, since there is
            then no interior word to remove.
    """
    words = sentence.split()
    if len(words) < 3:
        # Fail with a clear message instead of randint's opaque "low >= high".
        # The check happens before any draw, so the RNG state is untouched.
        raise ValueError(
            f"Need at least 3 words to remove an interior word, got {len(words)}"
        )
    # randint's upper bound is exclusive, so index is in [1, len(words) - 2]
    index = np_rng.randint(1, len(words) - 1)
    return " ".join(words[:index] + words[index + 1 :])
39
+
40
+
41
def prepare(raw: Path, public: Path, private: Path):
    """Split the raw training corpus into public train/test and private test files.

    Steps, in order:

    1. Extract ``raw / "train_v2.txt.zip"`` into ``raw``.
    2. Stream ``raw / "train_v2.txt"`` one sentence per line.  Each line draws
       one number from ``np_rng``; lines whose draw is ``<= 0.01`` and that
       have more than two words go to the test set, everything else is copied
       unchanged to ``public / "train_v2.txt"``.
    3. For each test sentence, the full (quote-escaped) sentence is written to
       ``private / "test.csv"`` and a copy with one interior word removed is
       written to ``public / "test_v2.txt"``, both under the same integer id.
    4. Copy the public test file to ``private / "sample_submission.csv"``,
       zip both public files and delete the uncompressed originals.
    5. Run sanity checks on the resulting files and counts.

    All text files are read and written as UTF-8 explicitly so the output
    does not depend on the platform's default locale encoding.

    Args:
        raw: Directory holding the downloaded ``train_v2.txt.zip``.
        public: Output directory for files made available to participants.
        private: Output directory for the held-out answers.
    """
    logger.info("Extracting raw / train_v2.txt.zip")
    extract(raw / "train_v2.txt.zip", raw)

    # computed this ahead of time
    total_lines = 30301028

    with (
        open(raw / "train_v2.txt", "r", encoding="utf-8") as old_train,
        open(public / "train_v2.txt", "w", encoding="utf-8") as public_train,
        open(public / "test_v2.txt", "w", encoding="utf-8") as public_test,
        open(private / "test.csv", "w", encoding="utf-8") as private_test,
    ):
        public_test.write('"id","sentence"\n')
        private_test.write('"id","sentence"\n')
        line_count = 0
        test_count = 0
        train_count = 0
        # there is one sentence per line
        for sentence in tqdm(old_train, desc="Processing data", total=total_lines):
            # we will put ~0.01 of the data in test, the rest in train, matching kaggle's original split
            # some sentences only have 2 words, so can't remove a word -- keep them in train
            # NOTE: the uniform draw comes first so exactly one number is consumed per line
            if np_rng.uniform() <= 0.01 and len(sentence.strip().split()) > 2:
                # get rid of linebreak and escape quotes
                sentence = sentence.strip().replace('"', '""')
                removed_word_sentence = remove_random_word(sentence)
                private_test.write(f'{test_count},"{sentence}"\n')
                public_test.write(f'{test_count},"{removed_word_sentence}"\n')
                test_count += 1
            else:
                # train lines keep their original trailing newline
                public_train.write(sentence)
                train_count += 1
            line_count += 1
            # never process more than the precomputed number of lines
            if line_count >= total_lines:
                break

    # we will be compressing the public files (to match what's on kaggle.com)
    # so copy our sample submission to private so we have access to it
    shutil.copy(public / "test_v2.txt", private / "sample_submission.csv")

    # compress the public files
    logger.info("Compressing train_v2.txt")
    compress_file_to_zip(public / "train_v2.txt", public / "train_v2.txt.zip")
    logger.info("Compressing test_v2.txt")
    compress_file_to_zip(public / "test_v2.txt", public / "test_v2.txt.zip")
    # remove the original files
    (public / "train_v2.txt").unlink()
    (public / "test_v2.txt").unlink()

    # Checks
    assert not (public / "train_v2.txt").exists(), "public / 'train_v2.txt' should not exist"
    assert (public / "train_v2.txt.zip").exists(), "public / 'train_v2.txt.zip' should exist"
    assert not (public / "test_v2.txt").exists(), "public / 'test_v2.txt' should not exist"
    assert (public / "test_v2.txt.zip").exists(), "public / 'test_v2.txt.zip' should exist"

    private_test_line_count = count_lines_in_file(private / "test.csv")
    assert (
        # minus 1 to exclude the header row
        private_test_line_count - 1
        == test_count
    ), "private / 'test.csv' has incorrect number of lines"
    assert (
        count_lines_in_file(private / "sample_submission.csv") == private_test_line_count
    ), "private / 'sample_submission.csv' has incorrect number of lines"
    assert (
        test_count + train_count == total_lines
    ), "Expected the number of test samples and train samples to sum to the total number of samples in the original train file"
@@ -0,0 +1,179 @@
1
+ import os
2
+ import shutil
3
+ import zipfile
4
+ from pathlib import Path
5
+
6
+ import numpy as np
7
+ from tqdm.auto import tqdm
8
+
9
+ from mlebench.utils import extract, get_logger
10
+
11
# Module-level random generator with a fixed seed (0); every random draw in this
# file goes through it, so repeated runs make the same choices.
np_rng = np.random.RandomState(0)

# Logger named after this module.
logger = get_logger(__name__)
14
+
15
+
16
def count_lines_in_file(file_path):
    """
    Return the number of lines in the UTF-8 text file at `file_path`.

    The file is streamed line by line rather than read into memory at once.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        return sum(1 for _ in handle)
22
+
23
+
24
def compress_file_to_zip(src_file: Path, zip_file: Path):
    """
    Create a DEFLATE-compressed zip archive at `zip_file` containing `src_file`.

    The archive is opened in write mode and holds a single entry, stored under
    the source file's bare name (no parent directories).
    """
    archive = zipfile.ZipFile(zip_file, mode="w", compression=zipfile.ZIP_DEFLATED)
    with archive:
        archive.write(src_file, arcname=src_file.name)
27
+
28
+
29
def remove_random_word(sentence: str) -> str:
    """
    Drop one randomly chosen interior token from a whitespace-delimited sentence.

    The first and last tokens are never removed. Punctuation is expected to be
    separated by whitespace already, so it counts as a token of its own.
    The surviving tokens are joined back together with single spaces.
    """
    tokens = sentence.split()
    # randint's upper bound is exclusive, so the pick lies in [1, len(tokens) - 2]
    drop_at = np_rng.randint(1, len(tokens) - 1)
    del tokens[drop_at]
    return " ".join(tokens)
39
+
40
+
41
def _split_and_process_data(
    input_file: Path,
    output_train_file: Path,
    output_public_test_file: Path,
    output_private_test_file: Path,
    total_lines: int,
) -> tuple[int, int]:
    """
    Stream `input_file` (one sentence per line) into a train file and a pair of test CSVs.

    Each line is sent to the test set when a uniform random draw is <= 0.01 and the
    line has more than two whitespace-separated words; every other line is written
    unchanged to `output_train_file`. For a test line, the private CSV gets the full
    sentence and the public CSV gets the same sentence with one random interior word
    removed, both under the same running id.

    `total_lines` is only used as the progress-bar total.

    Returns:
        A `(train_count, test_count)` tuple with the number of lines sent to each side.
    """
    csv_header = '"id","sentence"\n'
    n_train = 0
    n_test = 0
    with (
        open(input_file, "r", encoding="utf-8") as source,
        open(output_train_file, "w", encoding="utf-8") as train_out,
        open(output_public_test_file, "w", encoding="utf-8") as public_test_out,
        open(output_private_test_file, "w", encoding="utf-8") as private_test_out,
    ):
        public_test_out.write(csv_header)
        private_test_out.write(csv_header)

        progress = tqdm(source, desc=f"Processing {input_file.name}", total=total_lines)
        for raw_line in progress:
            # the random draw happens for every line, before the length check
            drawn_for_test = np_rng.uniform() <= 0.01
            stripped = raw_line.strip()

            # a two-word sentence has no interior word to remove, so it stays in train
            if not drawn_for_test or len(stripped.split()) <= 2:
                train_out.write(raw_line)
                n_train += 1
                continue

            # double any quotes so the sentence can sit inside a quoted CSV field
            escaped = stripped.replace('"', '""')
            with_gap = remove_random_word(escaped)
            private_test_out.write(f'{n_test},"{escaped}"\n')
            public_test_out.write(f'{n_test},"{with_gap}"\n')
            n_test += 1

    return n_train, n_test
79
+
80
+
81
def prepare(raw: Path, public: Path, private: Path):
    """
    Build the main train/test split and a second validation split nested inside it.

    `raw / "train_v2.txt.zip"` is extracted and split into train/test files under
    `public` / `private`. The resulting train file is then split again, with the
    outputs going to `public_val` / `private_val`, created next to `public` and
    `private`. In each public directory the text files are replaced by zip archives,
    and an uncompressed copy of the public test file is kept in the matching private
    directory as `sample_submission.csv`.
    """
    train_name = "train_v2.txt"
    test_name = "test_v2.txt"
    text_files = (train_name, test_name)

    def _archive_public_files(public_dir: Path, private_dir: Path, log_prefix: str) -> None:
        # the public files get compressed (to match what's on kaggle.com), so keep an
        # uncompressed copy of the public test file in private as the sample submission
        shutil.copy(public_dir / test_name, private_dir / "sample_submission.csv")
        for name in text_files:
            logger.info(f"{log_prefix}{name}")
            compress_file_to_zip(public_dir / name, public_dir / f"{name}.zip")
        # remove the originals only after both archives have been written
        for name in text_files:
            (public_dir / name).unlink()

    def _check_outputs(
        public_dir: Path,
        private_dir: Path,
        public_label: str,
        private_label: str,
        expected_test_count: int,
    ) -> None:
        for name in text_files:
            assert not (public_dir / name).exists(), f"{public_label} / '{name}' should not exist"
            assert (
                public_dir / f"{name}.zip"
            ).exists(), f"{public_label} / '{name}.zip' should exist"

        test_csv_lines = count_lines_in_file(private_dir / "test.csv")
        # one line of test.csv is the header
        assert (
            test_csv_lines - 1 == expected_test_count
        ), f"{private_label} / 'test.csv' has incorrect number of lines"
        assert (
            count_lines_in_file(private_dir / "sample_submission.csv") == test_csv_lines
        ), f"{private_label} / 'sample_submission.csv' has incorrect number of lines"

    logger.info("Extracting raw / train_v2.txt.zip")
    extract(raw / "train_v2.txt.zip", raw)

    # The validation directories sit next to the original public/private ones
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"
    for val_dir in (public_val, private_val):
        val_dir.mkdir(exist_ok=True, parents=True)

    # --- 1. Original Split (raw -> train/test) ---
    logger.info("--- Generating original train/test split ---")
    # computed this ahead of time
    total_lines = 30301028
    original_train_count, original_test_count = _split_and_process_data(
        input_file=raw / train_name,
        output_train_file=public / train_name,
        output_public_test_file=public / test_name,
        output_private_test_file=private / "test.csv",
        total_lines=total_lines,
    )
    assert (
        original_train_count + original_test_count == total_lines
    ), "Sum of train and test samples must equal total samples for original split."

    # --- 2. Second Split (train -> train_val/test_val) ---
    logger.info("--- Generating validation split from the new training set ---")
    # The second split reads the train file that the first split just wrote.
    val_split_total_lines = original_train_count
    val_train_count, val_test_count = _split_and_process_data(
        input_file=public / train_name,
        output_train_file=public_val / train_name,
        output_public_test_file=public_val / test_name,
        output_private_test_file=private_val / "test.csv",
        total_lines=val_split_total_lines,
    )
    assert (
        val_train_count + val_test_count == val_split_total_lines
    ), "Sum of train_val and test_val samples must equal total samples for validation split."

    # --- 3. Process and Compress Original public/private directories ---
    logger.info("--- Compressing and cleaning up original public/private directories ---")
    _archive_public_files(public, private, "Compressing ")

    # --- 4. Process and Compress New public_val/private_val directories ---
    logger.info("--- Compressing and cleaning up validation public_val/private_val directories ---")
    _archive_public_files(public_val, private_val, "Compressing validation ")

    # --- 5. Final Checks ---
    logger.info("--- Running final checks ---")
    _check_outputs(public, private, "public", "private", original_test_count)
    _check_outputs(public_val, private_val, "public_val", "private_val", val_test_count)
@@ -0,0 +1,40 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from Levenshtein import distance
4
+
5
+ from mlebench.grade_helpers import InvalidSubmissionError
6
+
7
+
8
def edit_distance_array(y_true: pd.Series, y_pred: pd.Series) -> float:
    """
    Return the mean of `distance(a, b)` over position-wise pairs from `y_true` and `y_pred`.

    Pairing stops at the end of the shorter input.
    """
    per_pair = list(map(distance, y_true, y_pred))
    return np.mean(per_pair)
10
+
11
+
12
def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame):
    """
    Validate a submission against the answers and align both for scoring.

    Both frames are sorted by `image_id` (the inputs themselves are left untouched)
    and their `InChI` columns are converted to strings.

    Returns:
        A `(y_true, y_pred)` tuple: the answers' and the submission's `InChI` columns,
        in matching `image_id` order.

    Raises:
        InvalidSubmissionError: if the lengths differ, the submission lacks an
            `image_id` or `InChI` column, or the sorted ids do not match.
    """
    if len(submission) != len(answers):
        raise InvalidSubmissionError("Submission and answers must have the same length")

    submission_requirements = (
        ("image_id", "Submission must have an 'image_id' column"),
        ("InChI", "Submission must have a 'InChI' column"),
    )
    for column, message in submission_requirements:
        if column not in submission.columns:
            raise InvalidSubmissionError(message)

    # a malformed answers frame is a bug on our side, not an invalid submission
    assert "image_id" in answers.columns, "Answers must have 'image_id' column"
    assert "InChI" in answers.columns, "Answers must have 'InChI' column"

    sorted_submission = submission.sort_values("image_id")
    sorted_answers = answers.sort_values("image_id")

    ids_differ = sorted_submission["image_id"].values != sorted_answers["image_id"].values
    if ids_differ.any():
        raise InvalidSubmissionError("Submission and answers must have the same ids")

    # Convert to strings
    y_pred = sorted_submission["InChI"].astype(str)
    y_true = sorted_answers["InChI"].astype(str)

    return y_true, y_pred
36
+
37
+
38
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """
    Score a submission as the mean edit distance between true and predicted `InChI` strings.

    The frames are first validated and aligned by `prepare_for_metric`.
    """
    return edit_distance_array(*prepare_for_metric(submission, answers))
@@ -0,0 +1,68 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+ from sklearn.model_selection import train_test_split
6
+ from tqdm import tqdm
7
+
8
+
9
def make_image_subpath(image_id: str) -> Path:
    """
    Return the relative path `<c0>/<c1>/<c2>/<image_id>.png`, where `c0`..`c2` are
    the first three characters of `image_id`.
    """
    first, second, third = image_id[0], image_id[1], image_id[2]
    return Path(first, second, third, f"{image_id}.png")
15
+
16
+
17
def prepare(raw: Path, public: Path, private: Path):
    """
    Split the labelled raw training data into a public train set and a held-out test set.

    20% of the rows of `raw / "train_labels.csv"` (fixed random state) become the test
    set. Train labels, train images, test images and a constant-prediction sample
    submission are written under `public`; the test labels go to `private / "test.csv"`.
    """
    # Load train data
    old_train = pd.read_csv(raw / "train_labels.csv")

    # Create train, test from train split
    new_train, new_test = train_test_split(old_train, test_size=0.2, random_state=0)
    new_train.to_csv(public / "train_labels.csv", index=False)
    new_test.to_csv(private / "test.csv", index=False)

    # Copy train files, then test files; both are taken from raw/train
    copy_jobs = (
        (new_train, "train", "Copying train images"),
        (new_test, "test", "Copying test images"),
    )
    for frame, subdir, description in copy_jobs:
        for image_id in tqdm(frame["image_id"], total=len(frame), desc=description):
            relative = make_image_subpath(image_id)
            target = public / subdir / relative
            target.parent.mkdir(parents=True, exist_ok=True)
            shutil.copyfile(src=raw / "train" / relative, dst=target)

    # Create sample submission: every test row gets the same placeholder prediction
    sample_submission = new_test.assign(InChI="InChI=1S/H2O/h1H2")
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    # Copy other files in the dataset (no modification needed)
    extra_name = "extra_approved_InChIs.csv"
    shutil.copyfile(src=raw / extra_name, dst=public / extra_name)

    # Checks
    assert len(new_train) + len(new_test) == len(
        old_train
    ), f"Expected {len(old_train)} total images in new_train ({len(new_train)}) and new_test ({len(new_test)})"

    train_image_count = len(list((public / "train").glob("**/*.png")))
    assert train_image_count == len(
        new_train
    ), f"Expected {len(new_train)} train images in public/train, but got {train_image_count}"

    test_image_count = len(list((public / "test").glob("**/*.png")))
    assert test_image_count == len(
        new_test
    ), f"Expected {len(new_test)} test images in public/test, but got {test_image_count}"

    assert "image_id" in sample_submission.columns, "Sample submission must have 'image_id' column"
    assert "InChI" in sample_submission.columns, "Sample submission must have 'InChI' column"
    assert len(sample_submission) == len(
        new_test
    ), f"Expected {len(new_test)} images in sample submission, but got {len(sample_submission)}"
@@ -0,0 +1,131 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+ from sklearn.model_selection import train_test_split
6
+ from tqdm import tqdm
7
+
8
+
9
def make_image_subpath(image_id: str) -> Path:
    """
    Build the relative image path for `image_id`: three nested directories named
    after its first three characters, then `<image_id>.png`.
    """
    nested_dirs = Path(image_id[0], image_id[1], image_id[2])
    return nested_dirs / f"{image_id}.png"
15
+
16
+
17
def _create_split_files(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    raw_images_path: Path,
    public_path: Path,
    private_path: Path,
):
    """
    Write the files for one train/test split into a public/private directory pair.

    Args:
        train_df: rows of the train side; must have an `image_id` column.
        test_df: rows of the test side; must have an `image_id` column.
        raw_images_path: directory holding the source images, laid out as
            produced by `make_image_subpath`.
        public_path: receives `train_labels.csv`, `train/`, `test/` and
            `sample_submission.csv`.
        private_path: receives `test.csv` (the test rows, labels included).

    Raises:
        AssertionError: if the number of copied images or the sample submission
            does not match the given dataframes.
    """
    # Create output directories; parents=True so a not-yet-existing parent
    # directory does not abort the run with FileNotFoundError
    public_path.mkdir(parents=True, exist_ok=True)
    private_path.mkdir(parents=True, exist_ok=True)

    # Save dataframes
    train_df.to_csv(public_path / "train_labels.csv", index=False)
    test_df.to_csv(private_path / "test.csv", index=False)

    # Copy train files, then test files. Only the image_id column is needed, so
    # iterate it directly instead of materialising every row with iterrows().
    desc_prefix = public_path.name
    copy_jobs = (
        (train_df, "train", f"Copying {desc_prefix} train images"),
        (test_df, "test", f"Copying {desc_prefix} test images"),
    )
    for frame, subdir, description in copy_jobs:
        for image_id in tqdm(frame["image_id"], total=len(frame), desc=description):
            relative = make_image_subpath(image_id)
            dst = public_path / subdir / relative
            dst.parent.mkdir(parents=True, exist_ok=True)
            shutil.copyfile(src=raw_images_path / relative, dst=dst)

    # Create sample submission: every test row gets the same placeholder prediction
    sample_submission = test_df.copy()
    sample_submission["InChI"] = "InChI=1S/H2O/h1H2"
    sample_submission.to_csv(public_path / "sample_submission.csv", index=False)

    # Checks
    train_image_count = len(list((public_path / "train").glob("**/*.png")))
    assert train_image_count == len(
        train_df
    ), f"Expected {len(train_df)} train images in {public_path}/train, but got {train_image_count}"

    test_image_count = len(list((public_path / "test").glob("**/*.png")))
    assert test_image_count == len(
        test_df
    ), f"Expected {len(test_df)} test images in {public_path}/test, but got {test_image_count}"

    assert "image_id" in sample_submission.columns, "Sample submission must have 'image_id' column"
    assert "InChI" in sample_submission.columns, "Sample submission must have 'InChI' column"
    assert len(sample_submission) == len(
        test_df
    ), f"Expected {len(test_df)} images in sample submission, but got {len(sample_submission)}"
72
+
73
+
74
def prepare(raw: Path, public: Path, private: Path):
    """
    Create the main train/test split and a second validation split carved out of its train side.

    The main split holds out 20% of `raw / "train_labels.csv"` and is written to
    `public` / `private`. The validation split holds out 25% of the resulting train
    set and is written to `public_val` / `private_val`, located next to `public` and
    `private`. `extra_approved_InChIs.csv` is copied into both public directories.
    """
    raw_images = raw / "train"

    # Load train data
    old_train = pd.read_csv(raw / "train_labels.csv")

    # ---- 1. Create the original train/test split ----
    # This split is for the main competition test set.
    new_train, new_test = train_test_split(old_train, test_size=0.2, random_state=0)
    _create_split_files(
        train_df=new_train,
        test_df=new_test,
        raw_images_path=raw_images,
        public_path=public,
        private_path=private,
    )

    # ---- 2. Create the new validation split ----
    # The validation test set should be as large as the original test set. The
    # original test set is 0.2 of the total and the train set 0.8 of it, so the
    # fraction to hold out of the train set is 0.2 / 0.8 = 0.25.
    train_val, test_val = train_test_split(new_train, test_size=0.25, random_state=0)

    # The validation directories sit next to the original ones
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"
    _create_split_files(
        train_df=train_val,
        test_df=test_val,
        raw_images_path=raw_images,
        public_path=public_val,
        private_path=private_val,
    )

    # ---- 3. Copy shared files and run final checks ----

    # Copy other files into both public and public_val directories
    extra_name = "extra_approved_InChIs.csv"
    for destination in (public, public_val):
        shutil.copyfile(src=raw / extra_name, dst=destination / extra_name)

    # Original split checks
    assert len(new_train) + len(new_test) == len(
        old_train
    ), f"Expected {len(old_train)} total images in new_train ({len(new_train)}) and new_test ({len(new_test)})"

    # New validation split checks
    assert len(train_val) + len(test_val) == len(
        new_train
    ), f"Expected {len(new_train)} total images in train_val ({len(train_val)}) and test_val ({len(test_val)})"
    # Ensure the size of the validation test set is approx. the same as the original test set
    size_gap = abs(len(test_val) - len(new_test))
    assert size_gap <= 1, "Validation test set size should match original test set size"
@@ -0,0 +1,12 @@
1
+ import pandas as pd
2
+ from sklearn.metrics import accuracy_score
3
+
4
+ from mlebench.competitions.utils import prepare_for_accuracy_metric
5
+
6
+
7
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """
    Score a submission with `accuracy_score`.

    `prepare_for_accuracy_metric` is called with `label` as the target column and
    `image_id` as the id column; its result is passed to `accuracy_score` as
    keyword arguments.
    """
    return accuracy_score(
        **prepare_for_accuracy_metric(
            submission=submission, answers=answers, target_column="label", id_column="image_id"
        )
    )
@@ -0,0 +1,113 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ from sklearn.model_selection import train_test_split
5
+ from tqdm.auto import tqdm
6
+
7
+ from mlebench.competitions.utils import get_ids_from_tf_records
8
+ from mlebench.utils import read_csv
9
+
10
+
11
def prepare(raw: Path, public: Path, private: Path):
    """
    Split the raw training data into public train/test sets at TFRecord-file granularity.

    10% of the `.tfrec` files in `raw / "train_tfrecords"` (fixed random state) are
    held out; every image id found in those files becomes a test row. CSVs, renamed
    TFRecords, images and the label map are written under `public`, and the test
    labels go to `private / "test.csv"`.
    """
    # need to split based on the TFRecord files, since not mentioned in the CSVs
    tfrecord_files = sorted(
        path
        for path in (raw / "train_tfrecords").iterdir()
        if path.is_file() and path.suffix == ".tfrec"
    )

    # In the original there are 21397 train samples and they say test has ~15000 test samples, which is ~ 0.4/0.6 test/train split
    # We use 0.1 ratio to avoid removing too many samples from train
    new_train_tfrecords, new_test_tfrecords = train_test_split(
        tfrecord_files, test_size=0.1, random_state=0
    )

    # parse the IDs from the test tf records
    test_ids = [
        image_id for path in new_test_tfrecords for image_id in get_ids_from_tf_records(path)
    ]

    old_train = read_csv(raw / "train.csv")

    # tag each row with its side of the split, then drop the helper column again
    old_train["split"] = "train"
    old_train.loc[old_train["image_id"].isin(test_ids), "split"] = "test"

    new_train = old_train[old_train["split"] == "train"].drop(columns=["split"])
    new_test = old_train[old_train["split"] == "test"].drop(columns=["split"])

    # the sample submission gives every test row the constant label 4
    sample_submission = new_test.assign(label=4)

    new_train.to_csv(public / "train.csv", index=False)
    new_test.to_csv(private / "test.csv", index=False)
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    # TFRecords are renumbered from 00 within each side; the part of the stem after
    # the first "-" is carried over unchanged
    tfrecord_jobs = (
        ("train", new_train_tfrecords, "Copying Train TFRecords"),
        ("test", new_test_tfrecords, "Copying Test TFRecords"),
    )
    for side, records, description in tfrecord_jobs:
        records_dir = public / f"{side}_tfrecords"
        records_dir.mkdir(parents=True, exist_ok=True)
        for i, path in tqdm(enumerate(records), desc=description, total=len(records)):
            length = path.stem.split("-")[1]
            shutil.copy(path, records_dir / f"ld_{side}{i:02d}-{length}.tfrec")

    # images for both sides come from raw/train_images
    image_jobs = (
        ("train_images", new_train, "Copying Train Images"),
        ("test_images", new_test, "Copying Test Images"),
    )
    for dir_name, frame, description in image_jobs:
        images_dir = public / dir_name
        images_dir.mkdir(parents=True, exist_ok=True)
        for image_id in tqdm(frame["image_id"], desc=description, total=len(frame)):
            shutil.copy(raw / "train_images" / image_id, images_dir)

    shutil.copy(raw / "label_num_to_disease_map.json", public / "label_num_to_disease_map.json")

    # checks
    assert len(new_train) + len(new_test) == len(
        old_train
    ), "Expected new train and new test lengths to sum to old train length"
    assert len(sample_submission) == len(
        new_test
    ), "Expected sample submission length to be equal to new test length"

    def _entry_count(directory: Path) -> int:
        return sum(1 for _ in directory.iterdir())

    assert len(new_train) == _entry_count(
        public / "train_images"
    ), "Mismatch in number of expected train images copied"
    assert len(new_test) == _entry_count(
        public / "test_images"
    ), "Mismatch in number of expected test images copied"

    assert len(new_train_tfrecords) == _entry_count(
        public / "train_tfrecords"
    ), "Mismatch in number of expected train TFRecords copied"
    assert len(new_test_tfrecords) == _entry_count(
        public / "test_tfrecords"
    ), "Mismatch in number of expected test TFRecords copied"

    expected_columns = ["image_id", "label"]
    column_checks = (
        (new_train, "new train"),
        (new_test, "new test"),
        (sample_submission, "sample submission"),
    )
    for frame, label in column_checks:
        assert (
            frame.columns.tolist() == expected_columns
        ), f"Expected {label} columns to be ['image_id', 'label']"

    assert set(new_train["image_id"]).isdisjoint(
        new_test["image_id"]
    ), "Expected train and test image IDs to be disjoint"