dslighting 1.7.1__py3-none-any.whl → 1.7.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (352) hide show
  1. dslighting/__init__.py +1 -1
  2. dslighting/core/agent.py +78 -62
  3. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/METADATA +3 -1
  4. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/RECORD +352 -7
  5. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/top_level.txt +1 -0
  6. mlebench/README.md +39 -0
  7. mlebench/__init__.py +0 -0
  8. mlebench/cli.py +221 -0
  9. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/grade.py +161 -0
  10. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/mAP_evaluation.py +425 -0
  11. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare.py +483 -0
  12. mlebench/competitions/3d-object-detection-for-autonomous-vehicles/prepare_val.py +719 -0
  13. mlebench/competitions/AI4Code/grade.py +70 -0
  14. mlebench/competitions/AI4Code/prepare.py +84 -0
  15. mlebench/competitions/AI4Code/prepare_val.py +159 -0
  16. mlebench/competitions/__init__.py +0 -0
  17. mlebench/competitions/aerial-cactus-identification/grade.py +11 -0
  18. mlebench/competitions/aerial-cactus-identification/prepare.py +71 -0
  19. mlebench/competitions/aerial-cactus-identification/prepare_val.py +133 -0
  20. mlebench/competitions/alaska2-image-steganalysis/grade.py +136 -0
  21. mlebench/competitions/alaska2-image-steganalysis/prepare.py +88 -0
  22. mlebench/competitions/alaska2-image-steganalysis/prepare_val.py +148 -0
  23. mlebench/competitions/aptos2019-blindness-detection/grade.py +35 -0
  24. mlebench/competitions/aptos2019-blindness-detection/prepare.py +75 -0
  25. mlebench/competitions/aptos2019-blindness-detection/prepare_val.py +123 -0
  26. mlebench/competitions/bike-sharing-demand/__init__.py +0 -0
  27. mlebench/competitions/bike-sharing-demand/grade.py +55 -0
  28. mlebench/competitions/bike-sharing-demand/prepare.py +37 -0
  29. mlebench/competitions/billion-word-imputation/grade.py +37 -0
  30. mlebench/competitions/billion-word-imputation/prepare.py +107 -0
  31. mlebench/competitions/billion-word-imputation/prepare_val.py +179 -0
  32. mlebench/competitions/bms-molecular-translation/grade.py +40 -0
  33. mlebench/competitions/bms-molecular-translation/prepare.py +68 -0
  34. mlebench/competitions/bms-molecular-translation/prepare_val.py +131 -0
  35. mlebench/competitions/cassava-leaf-disease-classification/grade.py +12 -0
  36. mlebench/competitions/cassava-leaf-disease-classification/prepare.py +113 -0
  37. mlebench/competitions/cassava-leaf-disease-classification/prepare_val.py +186 -0
  38. mlebench/competitions/cdiscount-image-classification-challenge/grade.py +11 -0
  39. mlebench/competitions/cdiscount-image-classification-challenge/prepare.py +144 -0
  40. mlebench/competitions/cdiscount-image-classification-challenge/prepare_val.py +205 -0
  41. mlebench/competitions/chaii-hindi-and-tamil-question-answering/grade.py +67 -0
  42. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare.py +31 -0
  43. mlebench/competitions/chaii-hindi-and-tamil-question-answering/prepare_val.py +94 -0
  44. mlebench/competitions/champs-scalar-coupling/grade.py +60 -0
  45. mlebench/competitions/champs-scalar-coupling/prepare.py +116 -0
  46. mlebench/competitions/champs-scalar-coupling/prepare_val.py +155 -0
  47. mlebench/competitions/conways-reverse-game-of-life-2020/__init__.py +0 -0
  48. mlebench/competitions/conways-reverse-game-of-life-2020/grade.py +40 -0
  49. mlebench/competitions/conways-reverse-game-of-life-2020/prepare.py +41 -0
  50. mlebench/competitions/demand-forecasting-kernels-only/__init__.py +0 -0
  51. mlebench/competitions/demand-forecasting-kernels-only/grade.py +66 -0
  52. mlebench/competitions/demand-forecasting-kernels-only/prepare.py +27 -0
  53. mlebench/competitions/demand_forecasting_kernels_only/__init__.py +0 -0
  54. mlebench/competitions/demand_forecasting_kernels_only/grade.py +66 -0
  55. mlebench/competitions/demand_forecasting_kernels_only/prepare.py +27 -0
  56. mlebench/competitions/denoising-dirty-documents/grade.py +44 -0
  57. mlebench/competitions/denoising-dirty-documents/prepare.py +134 -0
  58. mlebench/competitions/denoising-dirty-documents/prepare_val.py +178 -0
  59. mlebench/competitions/detecting-insults-in-social-commentary/grade.py +11 -0
  60. mlebench/competitions/detecting-insults-in-social-commentary/prepare.py +72 -0
  61. mlebench/competitions/detecting-insults-in-social-commentary/prepare_val.py +128 -0
  62. mlebench/competitions/dog-breed-identification/dogs.py +124 -0
  63. mlebench/competitions/dog-breed-identification/grade.py +42 -0
  64. mlebench/competitions/dog-breed-identification/prepare.py +55 -0
  65. mlebench/competitions/dog-breed-identification/prepare_val.py +104 -0
  66. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/grade.py +43 -0
  67. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare.py +70 -0
  68. mlebench/competitions/dogs-vs-cats-redux-kernels-edition/prepare_val.py +143 -0
  69. mlebench/competitions/ethanol-concentration/grade.py +23 -0
  70. mlebench/competitions/ethanol-concentration/prepare.py +90 -0
  71. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/grade.py +60 -0
  72. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare.py +41 -0
  73. mlebench/competitions/facebook-recruiting-iii-keyword-extraction/prepare_val.py +92 -0
  74. mlebench/competitions/feedback-prize-english-language-learning/__init__.py +0 -0
  75. mlebench/competitions/feedback-prize-english-language-learning/grade.py +60 -0
  76. mlebench/competitions/feedback-prize-english-language-learning/prepare.py +39 -0
  77. mlebench/competitions/freesound-audio-tagging-2019/grade.py +64 -0
  78. mlebench/competitions/freesound-audio-tagging-2019/prepare.py +94 -0
  79. mlebench/competitions/freesound-audio-tagging-2019/prepare_val.py +175 -0
  80. mlebench/competitions/freesound-audio-tagging-2019/vocabulary.py +83 -0
  81. mlebench/competitions/google-quest-challenge/classes.py +32 -0
  82. mlebench/competitions/google-quest-challenge/grade.py +45 -0
  83. mlebench/competitions/google-quest-challenge/prepare.py +58 -0
  84. mlebench/competitions/google-quest-challenge/prepare_val.py +120 -0
  85. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/grade.py +77 -0
  86. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py +155 -0
  87. mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare_val.py +211 -0
  88. mlebench/competitions/h-and-m-personalized-fashion-recommendations/grade.py +42 -0
  89. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare.py +102 -0
  90. mlebench/competitions/h-and-m-personalized-fashion-recommendations/prepare_val.py +132 -0
  91. mlebench/competitions/handwriting/grade.py +23 -0
  92. mlebench/competitions/handwriting/prepare.py +179 -0
  93. mlebench/competitions/herbarium-2020-fgvc7/grade.py +34 -0
  94. mlebench/competitions/herbarium-2020-fgvc7/prepare.py +251 -0
  95. mlebench/competitions/herbarium-2020-fgvc7/prepare_val.py +242 -0
  96. mlebench/competitions/herbarium-2021-fgvc8/grade.py +34 -0
  97. mlebench/competitions/herbarium-2021-fgvc8/prepare.py +251 -0
  98. mlebench/competitions/herbarium-2021-fgvc8/prepare_val.py +222 -0
  99. mlebench/competitions/herbarium-2022-fgvc9/grade.py +31 -0
  100. mlebench/competitions/herbarium-2022-fgvc9/prepare.py +233 -0
  101. mlebench/competitions/herbarium-2022-fgvc9/prepare_val.py +213 -0
  102. mlebench/competitions/histopathologic-cancer-detection/grade.py +12 -0
  103. mlebench/competitions/histopathologic-cancer-detection/prepare.py +59 -0
  104. mlebench/competitions/histopathologic-cancer-detection/prepare_val.py +131 -0
  105. mlebench/competitions/hms-harmful-brain-activity-classification/constants.py +9 -0
  106. mlebench/competitions/hms-harmful-brain-activity-classification/grade.py +43 -0
  107. mlebench/competitions/hms-harmful-brain-activity-classification/kaggle_metric_utilities.py +96 -0
  108. mlebench/competitions/hms-harmful-brain-activity-classification/kullback_leibler_divergence.py +118 -0
  109. mlebench/competitions/hms-harmful-brain-activity-classification/prepare.py +121 -0
  110. mlebench/competitions/hms-harmful-brain-activity-classification/prepare_val.py +190 -0
  111. mlebench/competitions/hotel-id-2021-fgvc8/grade.py +41 -0
  112. mlebench/competitions/hotel-id-2021-fgvc8/prepare.py +63 -0
  113. mlebench/competitions/hotel-id-2021-fgvc8/prepare_val.py +132 -0
  114. mlebench/competitions/hubmap-kidney-segmentation/grade.py +62 -0
  115. mlebench/competitions/hubmap-kidney-segmentation/prepare.py +108 -0
  116. mlebench/competitions/hubmap-kidney-segmentation/prepare_val.py +153 -0
  117. mlebench/competitions/icecube-neutrinos-in-deep-ice/grade.py +111 -0
  118. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare.py +127 -0
  119. mlebench/competitions/icecube-neutrinos-in-deep-ice/prepare_val.py +183 -0
  120. mlebench/competitions/ili/grade.py +60 -0
  121. mlebench/competitions/ili/prepare.py +99 -0
  122. mlebench/competitions/imet-2020-fgvc7/grade.py +54 -0
  123. mlebench/competitions/imet-2020-fgvc7/prepare.py +77 -0
  124. mlebench/competitions/imet-2020-fgvc7/prepare_val.py +157 -0
  125. mlebench/competitions/inaturalist-2019-fgvc6/grade.py +35 -0
  126. mlebench/competitions/inaturalist-2019-fgvc6/prepare.py +259 -0
  127. mlebench/competitions/inaturalist-2019-fgvc6/prepare_val.py +304 -0
  128. mlebench/competitions/instant-gratification/__init__.py +0 -0
  129. mlebench/competitions/instant-gratification/grade.py +55 -0
  130. mlebench/competitions/instant-gratification/prepare.py +25 -0
  131. mlebench/competitions/instant_gratification/__init__.py +0 -0
  132. mlebench/competitions/instant_gratification/grade.py +55 -0
  133. mlebench/competitions/instant_gratification/prepare.py +25 -0
  134. mlebench/competitions/invasive-species-monitoring/grade.py +11 -0
  135. mlebench/competitions/invasive-species-monitoring/prepare.py +97 -0
  136. mlebench/competitions/invasive-species-monitoring/prepare_val.py +164 -0
  137. mlebench/competitions/iwildcam-2019-fgvc6/grade.py +44 -0
  138. mlebench/competitions/iwildcam-2019-fgvc6/prepare.py +118 -0
  139. mlebench/competitions/iwildcam-2019-fgvc6/prepare_val.py +194 -0
  140. mlebench/competitions/iwildcam-2020-fgvc7/grade.py +11 -0
  141. mlebench/competitions/iwildcam-2020-fgvc7/prepare.py +164 -0
  142. mlebench/competitions/iwildcam-2020-fgvc7/prepare_val.py +245 -0
  143. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/classes.py +1 -0
  144. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/grade.py +54 -0
  145. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare.py +42 -0
  146. mlebench/competitions/jigsaw-toxic-comment-classification-challenge/prepare_val.py +88 -0
  147. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/grade.py +153 -0
  148. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare.py +36 -0
  149. mlebench/competitions/jigsaw-unintended-bias-in-toxicity-classification/prepare_val.py +117 -0
  150. mlebench/competitions/kuzushiji-recognition/grade.py +58 -0
  151. mlebench/competitions/kuzushiji-recognition/kuzushiji_metric.py +118 -0
  152. mlebench/competitions/kuzushiji-recognition/prepare.py +92 -0
  153. mlebench/competitions/kuzushiji-recognition/prepare_val.py +149 -0
  154. mlebench/competitions/leaf-classification/classes.py +101 -0
  155. mlebench/competitions/leaf-classification/grade.py +44 -0
  156. mlebench/competitions/leaf-classification/prepare.py +60 -0
  157. mlebench/competitions/leaf-classification/prepare_val.py +116 -0
  158. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/grade.py +44 -0
  159. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare.py +51 -0
  160. mlebench/competitions/learning-agency-lab-automated-essay-scoring-2/prepare_val.py +96 -0
  161. mlebench/competitions/liverpool-ion-switching/__init__.py +0 -0
  162. mlebench/competitions/liverpool-ion-switching/grade.py +52 -0
  163. mlebench/competitions/liverpool-ion-switching/prepare.py +27 -0
  164. mlebench/competitions/liverpool_ion_switching/__init__.py +0 -0
  165. mlebench/competitions/liverpool_ion_switching/grade.py +52 -0
  166. mlebench/competitions/liverpool_ion_switching/prepare.py +27 -0
  167. mlebench/competitions/lmsys-chatbot-arena/grade.py +63 -0
  168. mlebench/competitions/lmsys-chatbot-arena/prepare.py +52 -0
  169. mlebench/competitions/lmsys-chatbot-arena/prepare_val.py +115 -0
  170. mlebench/competitions/mcm_2024_c_test/grade.py +107 -0
  171. mlebench/competitions/mcm_2024_c_test/prepare.py +2 -0
  172. mlebench/competitions/ml2021spring-hw2/grade.py +11 -0
  173. mlebench/competitions/ml2021spring-hw2/prepare.py +58 -0
  174. mlebench/competitions/ml2021spring-hw2/prepare_val.py +135 -0
  175. mlebench/competitions/mlsp-2013-birds/grade.py +11 -0
  176. mlebench/competitions/mlsp-2013-birds/prepare.py +182 -0
  177. mlebench/competitions/mlsp-2013-birds/prepare_val.py +241 -0
  178. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/grade.py +11 -0
  179. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare.py +58 -0
  180. mlebench/competitions/movie-review-sentiment-analysis-kernels-only/prepare_val.py +120 -0
  181. mlebench/competitions/multi-modal-gesture-recognition/grade.py +58 -0
  182. mlebench/competitions/multi-modal-gesture-recognition/prepare.py +85 -0
  183. mlebench/competitions/multi-modal-gesture-recognition/prepare_val.py +139 -0
  184. mlebench/competitions/my-custom-task-01/prepare.py +2 -0
  185. mlebench/competitions/new-my-task-01/prepare.py +2 -0
  186. mlebench/competitions/new-my-task-03/grade.py +107 -0
  187. mlebench/competitions/new-my-task-03/prepare.py +2 -0
  188. mlebench/competitions/new-york-city-taxi-fare-prediction/grade.py +28 -0
  189. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare.py +44 -0
  190. mlebench/competitions/new-york-city-taxi-fare-prediction/prepare_val.py +89 -0
  191. mlebench/competitions/nfl-player-contact-detection/grade.py +36 -0
  192. mlebench/competitions/nfl-player-contact-detection/prepare.py +101 -0
  193. mlebench/competitions/nfl-player-contact-detection/prepare_val.py +186 -0
  194. mlebench/competitions/nomad2018-predict-transparent-conductors/grade.py +47 -0
  195. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare.py +77 -0
  196. mlebench/competitions/nomad2018-predict-transparent-conductors/prepare_val.py +144 -0
  197. mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py +74 -0
  198. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py +95 -0
  199. mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare_val.py +167 -0
  200. mlebench/competitions/paddy-disease-classification/grade.py +35 -0
  201. mlebench/competitions/paddy-disease-classification/prepare.py +69 -0
  202. mlebench/competitions/paddy-disease-classification/prepare_val.py +122 -0
  203. mlebench/competitions/petfinder-pawpularity-score/grade.py +41 -0
  204. mlebench/competitions/petfinder-pawpularity-score/prepare.py +76 -0
  205. mlebench/competitions/petfinder-pawpularity-score/prepare_val.py +154 -0
  206. mlebench/competitions/plant-pathology-2020-fgvc7/grade.py +41 -0
  207. mlebench/competitions/plant-pathology-2020-fgvc7/prepare.py +74 -0
  208. mlebench/competitions/plant-pathology-2020-fgvc7/prepare_val.py +160 -0
  209. mlebench/competitions/plant-pathology-2021-fgvc8/grade.py +54 -0
  210. mlebench/competitions/plant-pathology-2021-fgvc8/prepare.py +65 -0
  211. mlebench/competitions/plant-pathology-2021-fgvc8/prepare_val.py +130 -0
  212. mlebench/competitions/plant-seedlings-classification/grade.py +39 -0
  213. mlebench/competitions/plant-seedlings-classification/prepare.py +91 -0
  214. mlebench/competitions/plant-seedlings-classification/prepare_val.py +158 -0
  215. mlebench/competitions/playground-series-s3e1/__init__.py +0 -0
  216. mlebench/competitions/playground-series-s3e1/grade.py +52 -0
  217. mlebench/competitions/playground-series-s3e1/prepare.py +25 -0
  218. mlebench/competitions/playground-series-s3e11/__init__.py +0 -0
  219. mlebench/competitions/playground-series-s3e11/grade.py +55 -0
  220. mlebench/competitions/playground-series-s3e11/prepare.py +25 -0
  221. mlebench/competitions/playground-series-s3e18/grade.py +39 -0
  222. mlebench/competitions/playground-series-s3e18/prepare.py +36 -0
  223. mlebench/competitions/playground-series-s3e18/prepare_val.py +89 -0
  224. mlebench/competitions/playground_series_s3e1/__init__.py +0 -0
  225. mlebench/competitions/playground_series_s3e1/grade.py +52 -0
  226. mlebench/competitions/playground_series_s3e1/prepare.py +25 -0
  227. mlebench/competitions/playground_series_s3e11/__init__.py +0 -0
  228. mlebench/competitions/playground_series_s3e11/grade.py +55 -0
  229. mlebench/competitions/playground_series_s3e11/prepare.py +25 -0
  230. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/grade.py +44 -0
  231. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare.py +68 -0
  232. mlebench/competitions/predict-volcanic-eruptions-ingv-oe/prepare_val.py +146 -0
  233. mlebench/competitions/random-acts-of-pizza/grade.py +14 -0
  234. mlebench/competitions/random-acts-of-pizza/prepare.py +80 -0
  235. mlebench/competitions/random-acts-of-pizza/prepare_val.py +144 -0
  236. mlebench/competitions/ranzcr-clip-catheter-line-classification/classes.py +11 -0
  237. mlebench/competitions/ranzcr-clip-catheter-line-classification/grade.py +31 -0
  238. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare.py +53 -0
  239. mlebench/competitions/ranzcr-clip-catheter-line-classification/prepare_val.py +113 -0
  240. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/grade.py +124 -0
  241. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py +219 -0
  242. mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare_val.py +257 -0
  243. mlebench/competitions/rsna-breast-cancer-detection/grade.py +65 -0
  244. mlebench/competitions/rsna-breast-cancer-detection/prepare.py +141 -0
  245. mlebench/competitions/rsna-breast-cancer-detection/prepare_val.py +201 -0
  246. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/grade.py +13 -0
  247. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare.py +47 -0
  248. mlebench/competitions/rsna-miccai-brain-tumor-radiogenomic-classification/prepare_val.py +97 -0
  249. mlebench/competitions/santander-customer-satisfaction/grade.py +10 -0
  250. mlebench/competitions/santander-customer-satisfaction/prepare.py +41 -0
  251. mlebench/competitions/sciencebench-001-clintox-nn/__init__.py +0 -0
  252. mlebench/competitions/sciencebench-001-clintox-nn/grade.py +56 -0
  253. mlebench/competitions/sciencebench-001-clintox-nn/prepare.py +75 -0
  254. mlebench/competitions/sciencebench-015-aai/grade.py +37 -0
  255. mlebench/competitions/sciencebench-015-aai/prepare.py +102 -0
  256. mlebench/competitions/sciencebench-051-brain-blood-qsar/grade.py +58 -0
  257. mlebench/competitions/sciencebench-051-brain-blood-qsar/prepare.py +69 -0
  258. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/grade.py +55 -0
  259. mlebench/competitions/sciencebench-101-experimental-band-gap-prediction/prepare.py +88 -0
  260. mlebench/competitions/see-click-predict-fix/__init__.py +0 -0
  261. mlebench/competitions/see-click-predict-fix/grade.py +66 -0
  262. mlebench/competitions/see-click-predict-fix/prepare.py +25 -0
  263. mlebench/competitions/see_click_predict_fix/__init__.py +0 -0
  264. mlebench/competitions/see_click_predict_fix/grade.py +66 -0
  265. mlebench/competitions/see_click_predict_fix/prepare.py +25 -0
  266. mlebench/competitions/seti-breakthrough-listen/grade.py +11 -0
  267. mlebench/competitions/seti-breakthrough-listen/prepare.py +71 -0
  268. mlebench/competitions/seti-breakthrough-listen/prepare_val.py +159 -0
  269. mlebench/competitions/siim-covid19-detection/grade.py +194 -0
  270. mlebench/competitions/siim-covid19-detection/prepare.py +123 -0
  271. mlebench/competitions/siim-covid19-detection/prepare_val.py +164 -0
  272. mlebench/competitions/siim-isic-melanoma-classification/grade.py +11 -0
  273. mlebench/competitions/siim-isic-melanoma-classification/prepare.py +127 -0
  274. mlebench/competitions/siim-isic-melanoma-classification/prepare_val.py +158 -0
  275. mlebench/competitions/smartphone-decimeter-2022/grade.py +55 -0
  276. mlebench/competitions/smartphone-decimeter-2022/notebook.py +86 -0
  277. mlebench/competitions/smartphone-decimeter-2022/prepare.py +143 -0
  278. mlebench/competitions/smartphone-decimeter-2022/prepare_val.py +199 -0
  279. mlebench/competitions/spaceship-titanic/grade.py +11 -0
  280. mlebench/competitions/spaceship-titanic/prepare.py +23 -0
  281. mlebench/competitions/spaceship-titanic/prepare_val.py +61 -0
  282. mlebench/competitions/spooky-author-identification/classes.py +1 -0
  283. mlebench/competitions/spooky-author-identification/grade.py +38 -0
  284. mlebench/competitions/spooky-author-identification/prepare.py +40 -0
  285. mlebench/competitions/spooky-author-identification/prepare_val.py +78 -0
  286. mlebench/competitions/stanford-covid-vaccine/grade.py +65 -0
  287. mlebench/competitions/stanford-covid-vaccine/prepare.py +129 -0
  288. mlebench/competitions/stanford-covid-vaccine/prepare_val.py +199 -0
  289. mlebench/competitions/statoil-iceberg-classifier-challenge/grade.py +41 -0
  290. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare.py +105 -0
  291. mlebench/competitions/statoil-iceberg-classifier-challenge/prepare_val.py +157 -0
  292. mlebench/competitions/tabular-playground-series-dec-2021/grade.py +11 -0
  293. mlebench/competitions/tabular-playground-series-dec-2021/prepare.py +39 -0
  294. mlebench/competitions/tabular-playground-series-dec-2021/prepare_val.py +99 -0
  295. mlebench/competitions/tabular-playground-series-may-2022/grade.py +9 -0
  296. mlebench/competitions/tabular-playground-series-may-2022/prepare.py +56 -0
  297. mlebench/competitions/tabular-playground-series-may-2022/prepare_val.py +116 -0
  298. mlebench/competitions/tensorflow-speech-recognition-challenge/grade.py +11 -0
  299. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare.py +90 -0
  300. mlebench/competitions/tensorflow-speech-recognition-challenge/prepare_val.py +148 -0
  301. mlebench/competitions/tensorflow2-question-answering/grade.py +122 -0
  302. mlebench/competitions/tensorflow2-question-answering/prepare.py +122 -0
  303. mlebench/competitions/tensorflow2-question-answering/prepare_val.py +187 -0
  304. mlebench/competitions/text-normalization-challenge-english-language/grade.py +49 -0
  305. mlebench/competitions/text-normalization-challenge-english-language/prepare.py +115 -0
  306. mlebench/competitions/text-normalization-challenge-english-language/prepare_val.py +213 -0
  307. mlebench/competitions/text-normalization-challenge-russian-language/grade.py +49 -0
  308. mlebench/competitions/text-normalization-challenge-russian-language/prepare.py +113 -0
  309. mlebench/competitions/text-normalization-challenge-russian-language/prepare_val.py +165 -0
  310. mlebench/competitions/tgs-salt-identification-challenge/grade.py +144 -0
  311. mlebench/competitions/tgs-salt-identification-challenge/prepare.py +158 -0
  312. mlebench/competitions/tgs-salt-identification-challenge/prepare_val.py +166 -0
  313. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/grade.py +11 -0
  314. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare.py +95 -0
  315. mlebench/competitions/the-icml-2013-whale-challenge-right-whale-redux/prepare_val.py +141 -0
  316. mlebench/competitions/tmdb-box-office-prediction/__init__.py +0 -0
  317. mlebench/competitions/tmdb-box-office-prediction/grade.py +55 -0
  318. mlebench/competitions/tmdb-box-office-prediction/prepare.py +35 -0
  319. mlebench/competitions/tweet-sentiment-extraction/grade.py +67 -0
  320. mlebench/competitions/tweet-sentiment-extraction/prepare.py +36 -0
  321. mlebench/competitions/tweet-sentiment-extraction/prepare_val.py +106 -0
  322. mlebench/competitions/us-patent-phrase-to-phrase-matching/grade.py +31 -0
  323. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare.py +33 -0
  324. mlebench/competitions/us-patent-phrase-to-phrase-matching/prepare_val.py +71 -0
  325. mlebench/competitions/utils.py +266 -0
  326. mlebench/competitions/uw-madison-gi-tract-image-segmentation/grade.py +158 -0
  327. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py +139 -0
  328. mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare_val.py +193 -0
  329. mlebench/competitions/ventilator-pressure-prediction/__init__.py +0 -0
  330. mlebench/competitions/ventilator-pressure-prediction/grade.py +52 -0
  331. mlebench/competitions/ventilator-pressure-prediction/prepare.py +27 -0
  332. mlebench/competitions/ventilator-pressure-prediction/prepare_val.py +142 -0
  333. mlebench/competitions/ventilator_pressure_prediction/__init__.py +0 -0
  334. mlebench/competitions/ventilator_pressure_prediction/grade.py +52 -0
  335. mlebench/competitions/ventilator_pressure_prediction/prepare.py +27 -0
  336. mlebench/competitions/vesuvius-challenge-ink-detection/grade.py +97 -0
  337. mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py +122 -0
  338. mlebench/competitions/vesuvius-challenge-ink-detection/prepare_val.py +170 -0
  339. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py +220 -0
  340. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py +129 -0
  341. mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare_val.py +204 -0
  342. mlebench/competitions/whale-categorization-playground/grade.py +41 -0
  343. mlebench/competitions/whale-categorization-playground/prepare.py +103 -0
  344. mlebench/competitions/whale-categorization-playground/prepare_val.py +196 -0
  345. mlebench/data.py +420 -0
  346. mlebench/grade.py +209 -0
  347. mlebench/grade_helpers.py +235 -0
  348. mlebench/metrics.py +75 -0
  349. mlebench/registry.py +332 -0
  350. mlebench/utils.py +346 -0
  351. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/WHEEL +0 -0
  352. {dslighting-1.7.1.dist-info → dslighting-1.7.8.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,186 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+ from sklearn.model_selection import train_test_split
6
+ from tqdm.auto import tqdm
7
+
8
+ from mlebench.competitions.utils import get_ids_from_tf_records
9
+ from mlebench.utils import read_csv
10
+
11
+
12
def _process_and_save_split(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    train_tfrecords: list,
    test_tfrecords: list,
    public_dir: Path,
    private_dir: Path,
    raw_dir: Path,
):
    """
    Process a single data split (train/test) and save all associated files.

    Writes the competitor-visible CSVs (train.csv, sample_submission.csv) to
    ``public_dir``, the ground-truth CSV (test.csv) to ``private_dir``, copies
    and renames the TFRecord shards, copies the per-split image files, and
    finishes with sanity checks on the written output.

    Args:
        train_df: Rows of the raw train.csv assigned to the train split.
        test_df: Rows of the raw train.csv held out as the test split.
        train_tfrecords: TFRecord file paths assigned to the train split.
        test_tfrecords: TFRecord file paths assigned to the test split.
        public_dir: Destination directory for competitor-visible files.
        private_dir: Destination directory for grading-only files.
        raw_dir: Root of the raw competition download.

    Raises:
        AssertionError: If any written artifact fails its consistency check.
    """
    # Create output directories
    public_dir.mkdir(parents=True, exist_ok=True)
    private_dir.mkdir(parents=True, exist_ok=True)

    # --- Create and save CSV files ---
    sample_submission = test_df.copy()
    sample_submission["label"] = 4  # Default label for submission template

    train_df.to_csv(public_dir / "train.csv", index=False)
    test_df.to_csv(private_dir / "test.csv", index=False)  # Ground truth
    sample_submission.to_csv(public_dir / "sample_submission.csv", index=False)

    # --- Copy and rename TFRecord files ---
    # Shards are renumbered sequentially; the record-count suffix embedded in
    # the raw filename stem (after the "-") is preserved in the new name.
    (public_dir / "train_tfrecords").mkdir(exist_ok=True)
    for i, path in tqdm(
        enumerate(train_tfrecords),
        desc=f"Copying Train TFRecords to {public_dir.name}",
        total=len(train_tfrecords),
    ):
        length = path.stem.split("-")[1]
        new_name = f"ld_train{i:02d}-{length}.tfrec"
        shutil.copy(path, public_dir / "train_tfrecords" / new_name)

    (public_dir / "test_tfrecords").mkdir(exist_ok=True)
    for i, path in tqdm(
        enumerate(test_tfrecords),
        desc=f"Copying Test TFRecords to {public_dir.name}",
        total=len(test_tfrecords),
    ):
        length = path.stem.split("-")[1]
        new_name = f"ld_test{i:02d}-{length}.tfrec"
        shutil.copy(path, public_dir / "test_tfrecords" / new_name)

    # --- Copy image files ---
    # Both splits are carved out of the raw *train* set, so the test split's
    # images are also sourced from raw_dir / "train_images".
    (public_dir / "train_images").mkdir(exist_ok=True)
    for image_id in tqdm(
        train_df["image_id"],
        desc=f"Copying Train Images to {public_dir.name}",
        total=len(train_df),
    ):
        shutil.copy(raw_dir / "train_images" / image_id, public_dir / "train_images")

    (public_dir / "test_images").mkdir(exist_ok=True)
    for image_id in tqdm(
        test_df["image_id"],
        desc=f"Copying Test Images to {public_dir.name}",
        total=len(test_df),
    ):
        shutil.copy(raw_dir / "train_images" / image_id, public_dir / "test_images")

    # --- Copy auxiliary files ---
    shutil.copy(raw_dir / "label_num_to_disease_map.json", public_dir)

    # --- Perform checks for this split ---
    # FIX: the original first assertion compared a quantity to itself
    # (len(train)+len(test) == len(train)+len(test)) and could never fail.
    # Assert instead that the two frames partition distinct image IDs: the
    # union of IDs must account for every row (no duplicates, no overlap).
    assert len(set(train_df["image_id"]) | set(test_df["image_id"])) == len(
        train_df
    ) + len(test_df), f"Length check failed for {public_dir.name}"
    assert len(sample_submission) == len(
        test_df
    ), f"Sample submission length mismatch for {public_dir.name}"

    assert len(train_df) == sum(
        1 for _ in (public_dir / "train_images").iterdir()
    ), f"Train image count mismatch in {public_dir.name}"
    assert len(test_df) == sum(
        1 for _ in (public_dir / "test_images").iterdir()
    ), f"Test image count mismatch in {public_dir.name}"

    assert len(train_tfrecords) == sum(
        1 for _ in (public_dir / "train_tfrecords").iterdir()
    ), f"Train TFRecord count mismatch in {public_dir.name}"
    assert len(test_tfrecords) == sum(
        1 for _ in (public_dir / "test_tfrecords").iterdir()
    ), f"Test TFRecord count mismatch in {public_dir.name}"

    assert train_df.columns.tolist() == [
        "image_id",
        "label",
    ], f"Train columns mismatch for {public_dir.name}"
    assert test_df.columns.tolist() == [
        "image_id",
        "label",
    ], f"Test columns mismatch for {public_dir.name}"
    assert sample_submission.columns.tolist() == [
        "image_id",
        "label",
    ], f"Sample submission columns mismatch for {public_dir.name}"

    assert set(train_df["image_id"]).isdisjoint(
        test_df["image_id"]
    ), f"Train and test image IDs are not disjoint for {public_dir.name}"
116
+
117
+
118
def prepare(raw: Path, public: Path, private: Path):
    """
    Build the competition splits from the raw download.

    Produces the standard public/private split, plus a secondary
    public_val/private_val split carved out of the training portion so a
    validation set of the same size as the test set is available.
    """
    # Sibling directories for the validation split.
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"

    # The CSVs do not record shard membership, so splitting is done on the
    # TFRecord files themselves.
    shards = sorted(
        p
        for p in (raw / "train_tfrecords").iterdir()
        if p.is_file() and p.suffix == ".tfrec"
    )

    # --- FIRST SPLIT: original train/test sets ---
    # The original competition has 21397 train samples and reportedly ~15000
    # test samples (~0.6/0.4 train/test). A 0.1 test fraction is used instead
    # to avoid draining too many samples from train.
    train_shards, test_shards = train_test_split(
        shards, test_size=0.1, random_state=0
    )

    # Recover the image IDs stored in the held-out shards.
    held_out_ids = []
    for shard in test_shards:
        held_out_ids.extend(get_ids_from_tf_records(shard))

    # Partition the labels CSV according to shard membership.
    labels = read_csv(raw / "train.csv")
    in_test = labels["image_id"].isin(held_out_ids)
    train_df = labels[~in_test].copy()
    test_df = labels[in_test].copy()

    # Write the canonical split to 'public'/'private', leaving the original
    # outputs untouched.
    _process_and_save_split(
        train_df=train_df,
        test_df=test_df,
        train_tfrecords=train_shards,
        test_tfrecords=test_shards,
        public_dir=public,
        private_dir=private,
        raw_dir=raw,
    )

    # --- SECOND SPLIT: validation set carved out of the train portion ---
    # A test_size of 1/9 applied to train_shards (90% of the data) yields a
    # validation set that is 10% of the original total, matching the primary
    # test split size (0.9 * 1/9 = 0.1).
    inner_train_shards, val_shards = train_test_split(
        train_shards, test_size=1 / 9, random_state=0
    )

    # Recover the image IDs stored in the validation shards.
    val_ids = []
    for shard in val_shards:
        val_ids.extend(get_ids_from_tf_records(shard))

    # Partition the first-split train frame into the new train/val frames.
    in_val = train_df["image_id"].isin(val_ids)
    train_val_df = train_df[~in_val].copy()
    test_val_df = train_df[in_val].copy()

    # Write the validation split to 'public_val'/'private_val'.
    _process_and_save_split(
        train_df=train_val_df,
        test_df=test_val_df,
        train_tfrecords=inner_train_shards,
        test_tfrecords=val_shards,
        public_dir=public_val,
        private_dir=private_val,
        raw_dir=raw,
    )
@@ -0,0 +1,11 @@
1
+ import pandas as pd
2
+ from sklearn.metrics import accuracy_score
3
+
4
+ from mlebench.competitions.utils import prepare_for_accuracy_metric
5
+
6
+
7
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    """Score a submission by classification accuracy on the `category_id` column.

    Rows are matched between submission and answers via the `_id` column; the
    shared helper handles validation and alignment before scoring.
    """
    inputs = prepare_for_accuracy_metric(
        submission=submission,
        answers=answers,
        target_column="category_id",
        id_column="_id",
    )
    return accuracy_score(**inputs)
@@ -0,0 +1,144 @@
1
+ import shutil
2
+ from itertools import islice
3
+ from pathlib import Path
4
+
5
+ import bson
6
+ import pandas as pd
7
+ from sklearn.model_selection import train_test_split
8
+ from tqdm import tqdm
9
+
10
+
11
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.

    Reads the raw `train.bson`, holds out 10% of rows as the hidden test set,
    writes `train.bson` / `test.bson` / `train_example.bson` / `sample_submission.csv`
    to `public`, and the held-out `answers.csv` to `private`.
    """

    # When True, read from the small train_example.bson instead of the full file.
    dev_mode = False

    def read_ids_and_category_ids(file_path: Path) -> pd.DataFrame:
        """Read only the `_id` and `category_id` fields of a BSON file into a DataFrame."""
        records = []
        # Context manager closes the handle deterministically (the handle passed
        # to decode_file_iter was previously never closed).
        with open(file_path, "rb") as f:
            for d in tqdm(bson.decode_file_iter(f), desc="Reading BSON data"):
                records.append({"_id": d["_id"], "category_id": d["category_id"]})
        return pd.DataFrame(records)

    def filter_bson_by_ids(
        bson_file_path: Path,
        ids: set,
        write_path: Path,
        exclude_cols: list = None,
        chunk_size: int = 1000,
        max_rows: int = None,
    ):
        """
        Filters a BSON file by a set of IDs and writes the filtered data to a new BSON file.
        Records keep their original `_id`; only columns listed in `exclude_cols` are removed.

        Args:
            bson_file_path (Path): Path to the input BSON file.
            ids (set): Set of IDs to filter by.
            write_path (Path): Path to the output BSON file.
            exclude_cols (list, optional): List of columns to exclude from the output.
            chunk_size (int): Flush the output file every `chunk_size` written records.
            max_rows (int, optional): Maximum number of rows to write to the output file.
        """
        # `None` default avoids the shared mutable-default-argument pitfall.
        exclude_cols = [] if exclude_cols is None else exclude_cols
        num_written_rows = 0

        with open(bson_file_path, "rb") as src, open(write_path, "wb") as f:
            for record in tqdm(bson.decode_file_iter(src), desc="Filtering BSON data"):
                if record["_id"] not in ids:
                    continue
                for col in exclude_cols:
                    record.pop(col, None)
                num_written_rows += 1
                f.write(bson.BSON.encode(record))

                # Periodic flush keeps partial output on disk during long runs.
                if num_written_rows % chunk_size == 0:
                    f.flush()

                if max_rows is not None and num_written_rows >= max_rows:
                    break

    # Create train, test from train split. Original train.bson contains 7,069,896 rows. Original test.bson contains 1,768,182 rows.
    old_train = read_ids_and_category_ids(raw / "train.bson")

    # Ensure rows in train_example remain in new_train
    new_train, answers = train_test_split(old_train, test_size=0.1, random_state=0)
    answers = answers.sort_values(by="_id")

    # Create sample submission; `.copy()` avoids pandas' chained-assignment
    # warning when the constant prediction column is added below.
    sample_submission = answers[["_id"]].copy()
    sample_submission["category_id"] = 1000010653

    # Checks
    assert len(new_train) + len(answers) == len(
        old_train
    ), f"The length of new_train and answers combined should be equal to the original length of old_train. Got {len(new_train) + len(answers)} and {len(old_train)}"
    assert set(new_train["_id"]).isdisjoint(
        set(answers["_id"])
    ), "new_train and answers should not have any _ids in common"
    assert sample_submission.columns.tolist() == [
        "_id",
        "category_id",
    ], f"sample_submission should have columns _id and category_id. Got {sample_submission.columns.tolist()}"

    # Write new files
    answers.to_csv(private / "answers.csv", index=False)
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    # train_example.bson is the first 100 rows of train.bson; in dev mode we read
    # from it instead of the full train.bson. Hoisted once instead of repeating
    # the ternary at every call site.
    raw_train_bson = raw / "train_example.bson" if dev_mode else raw / "train.bson"

    filter_bson_by_ids(
        bson_file_path=raw_train_bson,
        ids=set(new_train["_id"]),
        write_path=public / "train.bson",
    )
    filter_bson_by_ids(
        bson_file_path=raw_train_bson,
        ids=set(answers["_id"]),
        write_path=public / "test.bson",
        exclude_cols=["category_id"],  # removes category_id for test.bson
    )

    # Write new train_example.bson which is the first 100 rows of the new train.bson
    filter_bson_by_ids(
        bson_file_path=raw_train_bson,
        ids=set(new_train["_id"]),
        write_path=public / "train_example.bson",
        max_rows=100,
    )

    def is_valid_bson_file(file_path: Path, chunk_size: int = 10000) -> bool:
        """Return True iff `file_path` can be decoded end-to-end as BSON."""
        try:
            with open(file_path, "rb") as f:
                data_iter = bson.decode_file_iter(f)
                for chunk in tqdm(
                    iter(lambda: list(islice(data_iter, chunk_size)), []),
                    desc=f"Validating {file_path.name}",
                ):
                    pd.DataFrame(chunk)  # Attempt to create a DataFrame from the chunk
        except Exception:
            # Any decode failure means the file is not valid BSON.
            return False

        return True

    # Check train.bson
    assert is_valid_bson_file(public / "train.bson"), "Couldn't parse `train.bson` as a bson file!"

    # Check test.bson
    assert is_valid_bson_file(public / "test.bson"), "Couldn't parse `test.bson` as a bson file!"

    # Copy over other files
    shutil.copy(raw / "category_names.csv", public / "category_names.csv")

    actual_new_train = read_ids_and_category_ids(public / "train.bson")
    actual_new_train_example = read_ids_and_category_ids(public / "train_example.bson")

    assert actual_new_train.iloc[:100].equals(
        actual_new_train_example
    ), "The first 100 rows of `train.bson` should be the same as `train_example.bson`"
@@ -0,0 +1,205 @@
1
+ import shutil
2
+ from itertools import islice
3
+ from pathlib import Path
4
+
5
+ import bson
6
+ import pandas as pd
7
+ from sklearn.model_selection import train_test_split
8
+ from tqdm import tqdm
9
+
10
+
11
def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.
    Also creates a secondary validation split in public_val/private_val directories.

    Split 1 (public/private): 90/10 split of the raw train set.
    Split 2 (public_val/private_val): 1/9 of the 90% train portion held out again,
    so the validation test set matches the original test set's share of the total.
    """

    # When True, read from the small train_example.bson instead of the full file.
    dev_mode = False

    def read_ids_and_category_ids(file_path: Path) -> pd.DataFrame:
        """Read only the `_id` and `category_id` fields of a BSON file into a DataFrame."""
        records = []
        # Context manager closes the handle deterministically (the handle passed
        # to decode_file_iter was previously never closed).
        with open(file_path, "rb") as f:
            for d in tqdm(bson.decode_file_iter(f), desc="Reading BSON data"):
                records.append({"_id": d["_id"], "category_id": d["category_id"]})
        return pd.DataFrame(records)

    def filter_bson_by_ids(
        bson_file_path: Path,
        ids: set,
        write_path: Path,
        exclude_cols: list = None,
        chunk_size: int = 1000,
        max_rows: int = None,
    ):
        """
        Filters a BSON file by a set of IDs and writes the filtered data to a new BSON file.
        Records keep their original `_id`; only columns listed in `exclude_cols` are removed.

        Args:
            bson_file_path (Path): Path to the input BSON file.
            ids (set): Set of IDs to filter by.
            write_path (Path): Path to the output BSON file.
            exclude_cols (list, optional): List of columns to exclude from the output.
            chunk_size (int): Flush the output file every `chunk_size` written records.
            max_rows (int, optional): Maximum number of rows to write to the output file.
        """
        # `None` default avoids the shared mutable-default-argument pitfall.
        exclude_cols = [] if exclude_cols is None else exclude_cols
        num_written_rows = 0

        with open(bson_file_path, "rb") as src, open(write_path, "wb") as f:
            for record in tqdm(
                bson.decode_file_iter(src), desc=f"Filtering BSON data for {write_path.name}"
            ):
                if record["_id"] not in ids:
                    continue
                for col in exclude_cols:
                    record.pop(col, None)
                num_written_rows += 1
                f.write(bson.BSON.encode(record))

                # Periodic flush keeps partial output on disk during long runs.
                if num_written_rows % chunk_size == 0:
                    f.flush()

                if max_rows is not None and num_written_rows >= max_rows:
                    break

    def is_valid_bson_file(file_path: Path, chunk_size: int = 10000) -> bool:
        """Return True iff `file_path` can be decoded end-to-end as BSON."""
        try:
            with open(file_path, "rb") as f:
                data_iter = bson.decode_file_iter(f)
                for chunk in tqdm(
                    iter(lambda: list(islice(data_iter, chunk_size)), []),
                    desc=f"Validating {file_path.name}",
                ):
                    pd.DataFrame(chunk)  # Attempt to create a DataFrame from the chunk
        except Exception as e:
            print(f"BSON validation failed for {file_path}: {e}")
            return False

        return True

    def _process_and_write_split(
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
        target_public_path: Path,
        target_private_path: Path,
    ):
        """
        Helper function to process a given train/test split and write all associated files
        to the specified public and private directories.
        """
        # Create output directories
        target_public_path.mkdir(exist_ok=True)
        target_private_path.mkdir(exist_ok=True)

        # Sort test dataframe for consistency
        answers = test_df.sort_values(by="_id")

        # Create sample submission; `.copy()` avoids pandas' chained-assignment
        # warning when the constant prediction column is added below.
        sample_submission = answers[["_id"]].copy()
        sample_submission["category_id"] = 1000010653

        # Basic integrity checks
        assert set(train_df["_id"]).isdisjoint(
            set(answers["_id"])
        ), "Train and test sets should not have any _ids in common"
        assert sample_submission.columns.tolist() == [
            "_id",
            "category_id",
        ], f"sample_submission should have columns _id and category_id. Got {sample_submission.columns.tolist()}"

        # Write new files
        answers.to_csv(target_private_path / "answers.csv", index=False)
        sample_submission.to_csv(target_public_path / "sample_submission.csv", index=False)

        # Determine raw data source based on dev_mode
        raw_bson_source = raw / "train_example.bson" if dev_mode else raw / "train.bson"

        filter_bson_by_ids(
            bson_file_path=raw_bson_source,
            ids=set(train_df["_id"]),
            write_path=target_public_path / "train.bson",
        )
        filter_bson_by_ids(
            bson_file_path=raw_bson_source,
            ids=set(answers["_id"]),
            write_path=target_public_path / "test.bson",
            exclude_cols=["category_id"],
        )
        filter_bson_by_ids(
            bson_file_path=raw_bson_source,
            ids=set(train_df["_id"]),
            write_path=target_public_path / "train_example.bson",
            max_rows=100,
        )

        # Validate generated BSON files
        assert is_valid_bson_file(target_public_path / "train.bson")
        assert is_valid_bson_file(target_public_path / "test.bson")

        # Copy over other files
        shutil.copy(raw / "category_names.csv", target_public_path / "category_names.csv")

        # Final check on train_example.bson content
        actual_new_train = read_ids_and_category_ids(target_public_path / "train.bson")
        actual_new_train_example = read_ids_and_category_ids(
            target_public_path / "train_example.bson"
        )

        assert actual_new_train.iloc[:100].equals(
            actual_new_train_example
        ), f"The first 100 rows of `train.bson` should be the same as `train_example.bson` in {target_public_path}"

    # --- Main Script Logic ---

    # Read the complete dataset IDs and categories
    # Original train.bson contains 7,069,896 rows. Original test.bson contains 1,768,182 rows.
    old_train = read_ids_and_category_ids(raw / "train.bson")

    # === 1. Original Split: (train -> new_train + test) ===
    # This split creates the primary competition data in `public` and `private`.
    # This block is functionally identical to the original script to ensure outputs do not change.
    print("--- Processing Original Split (public/private) ---")
    new_train, answers = train_test_split(old_train, test_size=0.1, random_state=0)

    assert len(new_train) + len(answers) == len(
        old_train
    ), f"The length of new_train and answers combined should be equal to the original length of old_train. Got {len(new_train) + len(answers)} and {len(old_train)}"

    _process_and_write_split(
        train_df=new_train,
        test_df=answers,
        target_public_path=public,
        target_private_path=private,
    )
    print("--- Original Split processing complete. ---")

    # === 2. New Validation Split: (new_train -> train_val + test_val) ===
    # This second split takes the `new_train` set from above and splits it again.
    # The outputs are saved to new, parallel directories `public_val` and `private_val`.
    print("\n--- Processing Validation Split (public_val/private_val) ---")

    # Define paths for the new validation set
    public_val = public.parent / "public_val"
    private_val = private.parent / "private_val"

    # To make the new `test_val` set have a similar size to the original `test` set (10% of total),
    # we must take a fraction of `new_train`. Since `new_train` is 90% of the total,
    # we take 1/9 of it to get our new test set. (1/9) * 0.9 = 0.1
    test_val_size_fraction = 1 / 9.0

    train_val, answers_val = train_test_split(
        new_train, test_size=test_val_size_fraction, random_state=0
    )

    assert len(train_val) + len(answers_val) == len(
        new_train
    ), "The validation split did not partition the new_train set correctly."

    _process_and_write_split(
        train_df=train_val,
        test_df=answers_val,
        target_public_path=public_val,
        target_private_path=private_val,
    )
    print("--- Validation Split processing complete. ---")
@@ -0,0 +1,67 @@
1
+ from pandas import DataFrame
2
+ from scipy.sparse import spmatrix
3
+ from sklearn.metrics import jaccard_score
4
+ from sklearn.preprocessing import MultiLabelBinarizer
5
+
6
+ from mlebench.grade_helpers import InvalidSubmissionError
7
+
8
+
9
+ def binarize_sentences(y_true: list[set[str]], y_pred: list[set[str]]) -> tuple[spmatrix, spmatrix]:
10
+ """
11
+ Binarizes lists of sentence sets so we can pass them as inputs to scipy.jaccard_score
12
+ """
13
+ all_words = list(set.union(*y_true, *y_pred))
14
+
15
+ mlb = MultiLabelBinarizer(classes=all_words, sparse_output=True)
16
+ y_true_bin = mlb.fit_transform(y_true)
17
+ y_pred_bin = mlb.transform(y_pred)
18
+
19
+ return y_true_bin, y_pred_bin
20
+
21
+
22
+ def prepare_for_grading(
23
+ submission: DataFrame, answers: DataFrame, target_column: str
24
+ ) -> dict[str, spmatrix]:
25
+ """
26
+ Extracts the sentences from the dataframe
27
+ Splits them into sets of lowercased words
28
+ Binarizes these sets so we can pass them as inputs to scipy.jaccard_score
29
+ """
30
+
31
+ if len(submission) != len(answers):
32
+ raise InvalidSubmissionError("Submission should be the same length as the answers")
33
+ if target_column not in submission.columns:
34
+ raise InvalidSubmissionError(f"Submission must have a `{target_column}` column")
35
+ if "id" not in submission.columns:
36
+ raise InvalidSubmissionError("Submission must have an `id` column")
37
+ assert target_column in answers.columns, f"Answers must have a `{target_column}` column"
38
+ assert "id" in answers.columns, "Answers must have an `id` column"
39
+
40
+ submission = submission.sort_values("id")
41
+ answers = answers.sort_values("id")
42
+
43
+ if (submission["id"].values != answers["id"].values).any():
44
+ raise InvalidSubmissionError("Submission does not contain same IDs as test")
45
+
46
+ # pandas reads empty cells as NaNs, which are float. We fill with empty string to match type
47
+ submission[target_column] = submission[target_column].fillna("")
48
+ answers[target_column] = answers[target_column].fillna("")
49
+
50
+ y_pred = submission[target_column].tolist()
51
+ y_true = answers[target_column].tolist()
52
+
53
+ # split into sets of words
54
+ y_pred_sentence_sets = [set(sentence.lower().split()) for sentence in y_pred]
55
+ y_true_sentence_sets = [set(sentence.lower().split()) for sentence in y_true]
56
+
57
+ # binarize for jaccard_score
58
+ y_true_bin, y_pred_bin = binarize_sentences(y_true_sentence_sets, y_pred_sentence_sets)
59
+
60
+ return {"y_pred": y_pred_bin, "y_true": y_true_bin}
61
+
62
+
63
def grade(submission: DataFrame, answers: DataFrame) -> float:
    """Return the sample-averaged Jaccard score of a submission against the answers."""
    inputs = prepare_for_grading(submission, answers, "PredictionString")

    # we use average=`samples` to match what the competition description suggests
    return jaccard_score(**inputs, average="samples")
@@ -0,0 +1,31 @@
1
+ from pathlib import Path
2
+
3
+ from sklearn.model_selection import train_test_split
4
+
5
+ from mlebench.utils import read_csv
6
+
7
+
8
+ def prepare(raw: Path, public: Path, private: Path):
9
+
10
+ # Create train, test from train split
11
+ old_train = read_csv(raw / "train.csv")
12
+ new_train, new_test = train_test_split(old_train, test_size=0.1, random_state=0)
13
+ new_test_without_labels = new_test.drop(columns=["answer_start", "answer_text"])
14
+
15
+ # make private test match submission format
16
+ new_test = new_test[["id", "answer_text"]]
17
+ new_test.columns = ["id", "PredictionString"]
18
+
19
+ # Copy over files
20
+ new_train.to_csv(public / "train.csv", index=False)
21
+ new_test_without_labels.to_csv(public / "test.csv", index=False)
22
+ new_test.to_csv(private / "test.csv", index=False)
23
+
24
+ # Create sample submission
25
+ sample_submission = new_test.copy()
26
+ sample_submission["PredictionString"] = "dummy text"
27
+ sample_submission.to_csv(public / "sample_submission.csv", index=False)
28
+
29
+ assert len(sample_submission) == len(
30
+ new_test
31
+ ), "Sample submission length does not match test length."